From 69d2a7b2e97e2a707a4475dbb0fc586c3561855f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 20 Nov 2023 15:40:52 -0500 Subject: [PATCH] Committing in initial version of code. --- .devcontainer/Dockerfile | 30 + .devcontainer/README.md | 64 + .../cuda11.8-conda/devcontainer.json | 37 + .devcontainer/cuda11.8-pip/devcontainer.json | 38 + .../cuda12.0-conda/devcontainer.json | 37 + .devcontainer/cuda12.0-pip/devcontainer.json | 38 + .flake8 | 24 + .github/CODEOWNERS | 21 + .github/ISSUE_TEMPLATE/bug_report.md | 26 + .../ISSUE_TEMPLATE/documentation-request.md | 35 + .github/ISSUE_TEMPLATE/feature_request.md | 20 + .github/ISSUE_TEMPLATE/submit-question.md | 10 + .github/PULL_REQUEST_TEMPLATE.md | 44 + .github/copy-pr-bot.yaml | 4 + .github/labeler.yml | 16 + .github/ops-bot.yaml | 8 + .github/workflows/build.yaml | 88 + .github/workflows/labeler.yml | 11 + .github/workflows/pr.yaml | 96 + .github/workflows/test.yaml | 51 + .gitignore | 67 + .pre-commit-config.yaml | 112 + CHANGELOG.md | 0 LICENSE | 201 ++ VERSION | 1 + build.sh | 526 ++++ ci/build_cpp.sh | 18 + ci/build_docs.sh | 48 + ci/build_python.sh | 53 + ci/build_wheel.sh | 52 + ci/build_wheel_cuvs.sh | 9 + ci/check_style.sh | 18 + ci/checks/black_lists.sh | 61 + ci/checks/copyright.py | 289 ++ ci/release/update-version.sh | 101 + ci/test_cpp.sh | 43 + ci/test_python.sh | 56 + ci/test_wheel_cuvs.sh | 18 + ci/wheel_smoke_test_cuvs.py | 53 + .../all_cuda-118_arch-aarch64.yaml | 54 + .../all_cuda-118_arch-x86_64.yaml | 54 + .../all_cuda-120_arch-aarch64.yaml | 50 + .../all_cuda-120_arch-x86_64.yaml | 50 + .../bench_ann_cuda-118_arch-aarch64.yaml | 44 + .../bench_ann_cuda-118_arch-x86_64.yaml | 44 + .../bench_ann_cuda-120_arch-aarch64.yaml | 40 + .../bench_ann_cuda-120_arch-x86_64.yaml | 40 + conda/recipes/cuda-ann-bench-cpu/build.sh | 5 + .../conda_build_config.yaml | 20 + conda/recipes/cuda-ann-bench-cpu/meta.yaml | 66 + conda/recipes/cuda-ann-bench/build.sh | 5 + 
.../cuda-ann-bench/conda_build_config.yaml | 70 + conda/recipes/cuda-ann-bench/meta.yaml | 104 + conda/recipes/cuvs/build.sh | 5 + conda/recipes/cuvs/conda_build_config.yaml | 17 + conda/recipes/cuvs/meta.yaml | 76 + conda/recipes/libcuvs/build_libcuvs.sh | 4 + conda/recipes/libcuvs/build_libcuvs_static.sh | 5 + .../recipes/libcuvs/build_libcuvs_template.sh | 5 + conda/recipes/libcuvs/build_libraft_tests.sh | 5 + conda/recipes/libcuvs/conda_build_config.yaml | 73 + conda/recipes/libcuvs/meta.yaml | 173 ++ cpp/.clang-format | 155 + cpp/.clang-tidy | 229 ++ cpp/.clangd | 65 + cpp/CMakeLists.txt | 741 +++++ cpp/bench/ann/CMakeLists.txt | 380 +++ cpp/bench/ann/README.md | 3 + cpp/bench/ann/src/common/ann_types.hpp | 138 + cpp/bench/ann/src/common/benchmark.cpp | 109 + cpp/bench/ann/src/common/benchmark.hpp | 714 +++++ cpp/bench/ann/src/common/conf.hpp | 156 + .../src/common/cuda_huge_page_resource.hpp | 132 + .../ann/src/common/cuda_pinned_resource.hpp | 130 + cpp/bench/ann/src/common/cuda_stub.hpp | 234 ++ cpp/bench/ann/src/common/dataset.hpp | 501 ++++ cpp/bench/ann/src/common/thread_pool.hpp | 133 + cpp/bench/ann/src/common/util.hpp | 348 +++ .../ann/src/faiss/faiss_cpu_benchmark.cpp | 162 ++ cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h | 313 ++ .../ann/src/faiss/faiss_gpu_benchmark.cu | 162 ++ cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h | 435 +++ cpp/bench/ann/src/ggnn/ggnn_benchmark.cu | 127 + cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh | 294 ++ .../ann/src/hnswlib/hnswlib_benchmark.cpp | 120 + cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h | 230 ++ .../src/raft/raft_ann_bench_param_parser.h | 252 ++ cpp/bench/ann/src/raft/raft_ann_bench_utils.h | 44 + cpp/bench/ann/src/raft/raft_benchmark.cu | 140 + cpp/bench/ann/src/raft/raft_cagra.cu | 22 + cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu | 95 + .../ann/src/raft/raft_cagra_hnswlib_wrapper.h | 120 + cpp/bench/ann/src/raft/raft_cagra_wrapper.h | 286 ++ cpp/bench/ann/src/raft/raft_ivf_flat.cu | 22 + 
.../ann/src/raft/raft_ivf_flat_wrapper.h | 137 + cpp/bench/ann/src/raft/raft_ivf_pq.cu | 22 + cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h | 221 ++ cpp/bench/ann/src/raft/raft_wrapper.h | 153 + cpp/bench/prims/CMakeLists.txt | 166 ++ cpp/bench/prims/cluster/kmeans.cu | 124 + cpp/bench/prims/cluster/kmeans_balanced.cu | 99 + cpp/bench/prims/common/benchmark.hpp | 356 +++ cpp/bench/prims/core/bitset.cu | 74 + cpp/bench/prims/core/copy.cu | 401 +++ cpp/bench/prims/distance/distance_common.cuh | 92 + cpp/bench/prims/distance/distance_cosine.cu | 23 + cpp/bench/prims/distance/distance_exp_l2.cu | 24 + cpp/bench/prims/distance/distance_l1.cu | 23 + cpp/bench/prims/distance/distance_unexp_l2.cu | 24 + cpp/bench/prims/distance/fused_l2_nn.cu | 161 ++ cpp/bench/prims/distance/kernels.cu | 120 + cpp/bench/prims/distance/masked_nn.cu | 263 ++ .../prims/distance/tune_pairwise/bench.cu | 151 + .../prims/distance/tune_pairwise/kernel.cu | 88 + .../prims/distance/tune_pairwise/kernel.cuh | 44 + cpp/bench/prims/linalg/add.cu | 51 + cpp/bench/prims/linalg/map_then_reduce.cu | 64 + cpp/bench/prims/linalg/matrix_vector_op.cu | 156 + cpp/bench/prims/linalg/norm.cu | 85 + cpp/bench/prims/linalg/normalize.cu | 79 + cpp/bench/prims/linalg/reduce.cu | 62 + cpp/bench/prims/linalg/reduce_cols_by_key.cu | 78 + cpp/bench/prims/linalg/reduce_rows_by_key.cu | 88 + cpp/bench/prims/main.cpp | 19 + cpp/bench/prims/matrix/argmin.cu | 67 + cpp/bench/prims/matrix/gather.cu | 102 + cpp/bench/prims/matrix/main.cpp | 41 + cpp/bench/prims/matrix/select_k.cu | 342 +++ cpp/bench/prims/neighbors/cagra_bench.cuh | 206 ++ cpp/bench/prims/neighbors/knn.cuh | 509 ++++ .../knn/brute_force_float_int64_t.cu | 23 + .../knn/brute_force_float_uint32_t.cu | 23 + .../neighbors/knn/cagra_float_uint32_t.cu | 23 + .../knn/ivf_flat_filter_float_int64_t.cu | 24 + .../neighbors/knn/ivf_flat_float_int64_t.cu | 23 + .../neighbors/knn/ivf_flat_int8_t_int64_t.cu | 23 + .../neighbors/knn/ivf_flat_uint8_t_int64_t.cu | 23 + 
.../knn/ivf_pq_filter_float_int64_t.cu | 24 + .../neighbors/knn/ivf_pq_float_int64_t.cu | 23 + .../neighbors/knn/ivf_pq_int8_t_int64_t.cu | 23 + .../neighbors/knn/ivf_pq_uint8_t_int64_t.cu | 23 + cpp/bench/prims/neighbors/refine.cuh | 109 + .../prims/neighbors/refine_float_int64_t.cu | 25 + .../prims/neighbors/refine_uint8_t_int64_t.cu | 25 + cpp/bench/prims/random/make_blobs.cu | 87 + cpp/bench/prims/random/permute.cu | 79 + cpp/bench/prims/random/rng.cu | 120 + cpp/bench/prims/sparse/convert_csr.cu | 137 + cpp/cmake/config.json | 43 + cpp/cmake/modules/ConfigureCUDA.cmake | 57 + cpp/cmake/modules/FindAVX.cmake | 110 + cpp/cmake/patches/ggnn.patch | 229 ++ cpp/cmake/patches/hnswlib.patch | 130 + cpp/cmake/patches/nlohmann_json.patch | 38 + cpp/cmake/thirdparty/get_cutlass.cmake | 92 + cpp/cmake/thirdparty/get_faiss.cmake | 110 + cpp/cmake/thirdparty/get_fmt.cmake | 22 + cpp/cmake/thirdparty/get_ggnn.cmake | 44 + cpp/cmake/thirdparty/get_glog.cmake | 48 + cpp/cmake/thirdparty/get_gtest.cmake | 22 + cpp/cmake/thirdparty/get_hnswlib.cmake | 54 + cpp/cmake/thirdparty/get_nlohmann_json.cmake | 39 + cpp/cmake/thirdparty/get_rmm.cmake | 23 + cpp/cmake/thirdparty/get_spdlog.cmake | 33 + cpp/cmake/thirdparty/get_thrust.cmake | 24 + cpp/doxygen/Doxyfile | 2546 +++++++++++++++++ cpp/doxygen/header.html | 62 + cpp/doxygen/main_page.md | 22 + .../cuvs/cluster/detail/agglomerative.cuh | 328 +++ .../cuvs/cluster/detail/connectivities.cuh | 236 ++ cpp/include/cuvs/cluster/detail/kmeans.cuh | 1254 ++++++++ .../cluster/detail/kmeans_auto_find_k.cuh | 233 ++ .../cuvs/cluster/detail/kmeans_balanced.cuh | 1097 +++++++ .../cuvs/cluster/detail/kmeans_common.cuh | 663 +++++ .../cuvs/cluster/detail/kmeans_deprecated.cuh | 1001 +++++++ cpp/include/cuvs/cluster/detail/mst.cuh | 207 ++ .../cuvs/cluster/detail/single_linkage.cuh | 125 + cpp/include/cuvs/cluster/kmeans.cuh | 1116 ++++++++ cpp/include/cuvs/cluster/kmeans_balanced.cuh | 366 +++ .../cuvs/cluster/kmeans_balanced_types.hpp | 47 + 
.../cuvs/cluster/kmeans_deprecated.cuh | 65 + cpp/include/cuvs/cluster/kmeans_types.hpp | 122 + cpp/include/cuvs/cluster/single_linkage.cuh | 112 + .../cuvs/cluster/single_linkage_types.hpp | 83 + cpp/include/cuvs/cluster/specializations.cuh | 22 + .../cuvs/distance/detail/compress_to_bits.cuh | 123 + cpp/include/cuvs/distance/detail/distance.cuh | 814 ++++++ .../distance/detail/distance_ops/all_ops.cuh | 35 + .../distance/detail/distance_ops/canberra.cuh | 71 + .../detail/distance_ops/correlation.cuh | 126 + .../distance/detail/distance_ops/cosine.cuh | 85 + .../distance/detail/distance_ops/cutlass.cuh | 40 + .../distance/detail/distance_ops/hamming.cuh | 73 + .../detail/distance_ops/hellinger.cuh | 77 + .../detail/distance_ops/jensen_shannon.cuh | 81 + .../detail/distance_ops/kl_divergence.cuh | 99 + .../cuvs/distance/detail/distance_ops/l1.cuh | 62 + .../distance/detail/distance_ops/l2_exp.cuh | 136 + .../distance/detail/distance_ops/l2_unexp.cuh | 79 + .../distance/detail/distance_ops/l_inf.cuh | 67 + .../distance/detail/distance_ops/lp_unexp.cuh | 78 + .../detail/distance_ops/russel_rao.cuh | 74 + .../distance/detail/distance_ops/template.cuh | 68 + .../custom_epilogue_with_broadcast.h | 671 +++++ .../detail/fused_distance_nn/cutlass_base.cuh | 161 ++ .../detail/fused_distance_nn/epilogue.cuh | 136 + .../epilogue_elementwise.cuh | 216 ++ .../distance/detail/fused_distance_nn/gemm.h | 410 +++ .../fused_distance_nn/persistent_gemm.h | 515 ++++ .../predicated_tile_iterator_normvec_smem.h | 448 +++ .../predicated_tile_iterator_reduced_vec.h | 626 ++++ .../cuvs/distance/detail/fused_l2_nn.cuh | 385 +++ .../distance/detail/kernels/gram_matrix.cuh | 489 ++++ .../detail/kernels/kernel_factory.cuh | 64 + .../detail/kernels/kernel_matrices.cuh | 777 +++++ .../distance/detail/kernels/rbf_fin_op.cuh | 51 + .../distance/detail/masked_distance_base.cuh | 326 +++ .../cuvs/distance/detail/masked_nn.cuh | 327 +++ .../detail/pairwise_distance_base.cuh | 326 +++ 
.../detail/pairwise_distance_cutlass_base.cuh | 172 ++ .../detail/pairwise_distance_epilogue.h | 101 + .../pairwise_distance_epilogue_elementwise.h | 171 ++ .../distance/detail/pairwise_distance_gemm.h | 239 ++ .../detail/pairwise_matrix/dispatch-ext.cuh | 194 ++ .../detail/pairwise_matrix/dispatch-inl.cuh | 127 + .../detail/pairwise_matrix/dispatch.cuh | 24 + .../pairwise_matrix/dispatch_layout.cuh | 116 + .../detail/pairwise_matrix/dispatch_sm60.cuh | 84 + .../detail/pairwise_matrix/dispatch_sm80.cuh | 68 + .../detail/pairwise_matrix/kernel_sm60.cuh | 155 + .../detail/pairwise_matrix/params.cuh | 47 + .../detail/predicated_tile_iterator_normvec.h | 585 ++++ cpp/include/cuvs/distance/distance-ext.cuh | 1065 +++++++ cpp/include/cuvs/distance/distance-inl.cuh | 477 +++ cpp/include/cuvs/distance/distance.cuh | 24 + cpp/include/cuvs/distance/distance_types.hpp | 108 + cpp/include/cuvs/distance/fused_l2_nn-ext.cuh | 82 + cpp/include/cuvs/distance/fused_l2_nn-inl.cuh | 206 ++ cpp/include/cuvs/distance/fused_l2_nn.cuh | 24 + .../cuvs/distance/fused_l2_nn_helpers.cuh | 50 + cpp/include/cuvs/distance/kernels.cuh | 32 + cpp/include/cuvs/distance/masked_nn.cuh | 199 ++ cpp/include/cuvs/distance/specializations.cuh | 22 + .../distance/specializations/distance.cuh | 22 + .../specializations/fused_l2_nn_min.cuh | 22 + cpp/include/cuvs/neighbors/ann_types.hpp | 52 + cpp/include/cuvs/neighbors/ball_cover-ext.cuh | 124 + cpp/include/cuvs/neighbors/ball_cover-inl.cuh | 395 +++ cpp/include/cuvs/neighbors/ball_cover.cuh | 24 + .../cuvs/neighbors/ball_cover_types.hpp | 169 ++ .../cuvs/neighbors/brute_force-ext.cuh | 148 + .../cuvs/neighbors/brute_force-inl.cuh | 354 +++ cpp/include/cuvs/neighbors/brute_force.cuh | 92 + .../cuvs/neighbors/brute_force_types.hpp | 282 ++ cpp/include/cuvs/neighbors/cagra.cuh | 418 +++ .../cuvs/neighbors/cagra_serialize.cuh | 231 ++ cpp/include/cuvs/neighbors/cagra_types.hpp | 363 +++ .../cuvs/neighbors/detail/cagra/bitonic.hpp | 226 ++ 
.../neighbors/detail/cagra/cagra_build.cuh | 349 +++ .../neighbors/detail/cagra/cagra_search.cuh | 193 ++ .../detail/cagra/cagra_serialize.cuh | 281 ++ .../detail/cagra/compute_distance.hpp | 260 ++ .../neighbors/detail/cagra/device_common.hpp | 52 + .../cuvs/neighbors/detail/cagra/factory.cuh | 97 + .../cuvs/neighbors/detail/cagra/fragment.hpp | 211 ++ .../neighbors/detail/cagra/graph_core.cuh | 573 ++++ .../cuvs/neighbors/detail/cagra/hashmap.hpp | 79 + .../detail/cagra/search_multi_cta.cuh | 255 ++ .../cagra/search_multi_cta_kernel-ext.cuh | 114 + .../cagra/search_multi_cta_kernel-inl.cuh | 530 ++++ .../detail/cagra/search_multi_cta_kernel.cuh | 24 + .../detail/cagra/search_multi_kernel.cuh | 862 ++++++ .../neighbors/detail/cagra/search_plan.cuh | 330 +++ .../detail/cagra/search_single_cta.cuh | 247 ++ .../cagra/search_single_cta_kernel-ext.cuh | 119 + .../cagra/search_single_cta_kernel-inl.cuh | 956 +++++++ .../detail/cagra/search_single_cta_kernel.cuh | 24 + .../neighbors/detail/cagra/topk_by_radix.cuh | 91 + .../detail/cagra/topk_for_cagra/topk.h | 58 + .../detail/cagra/topk_for_cagra/topk_core.cuh | 1038 +++++++ .../cuvs/neighbors/detail/cagra/utils.hpp | 282 ++ .../cuvs/neighbors/detail/div_utils.hpp | 66 + .../detail/faiss_select/Comparators.cuh | 29 + .../detail/faiss_select/DistanceUtils.h | 52 + .../detail/faiss_select/MergeNetworkBlock.cuh | 276 ++ .../detail/faiss_select/MergeNetworkUtils.cuh | 25 + .../detail/faiss_select/MergeNetworkWarp.cuh | 520 ++++ .../neighbors/detail/faiss_select/Select.cuh | 554 ++++ .../detail/faiss_select/StaticUtils.h | 48 + .../faiss_select/key_value_block_select.cuh | 224 ++ .../cuvs/neighbors/detail/ivf_flat_build.cuh | 493 ++++ .../detail/ivf_flat_interleaved_scan-ext.cuh | 75 + .../detail/ivf_flat_interleaved_scan-inl.cuh | 1122 ++++++++ .../detail/ivf_flat_interleaved_scan.cuh | 25 + .../neighbors/detail/ivf_flat_search-ext.cuh | 64 + .../neighbors/detail/ivf_flat_search-inl.cuh | 260 ++ 
.../cuvs/neighbors/detail/ivf_flat_search.cuh | 24 + .../neighbors/detail/ivf_flat_serialize.cuh | 174 ++ .../cuvs/neighbors/detail/ivf_pq_build.cuh | 1918 +++++++++++++ .../neighbors/detail/ivf_pq_codepacking.cuh | 214 ++ .../detail/ivf_pq_compute_similarity-ext.cuh | 218 ++ .../detail/ivf_pq_compute_similarity-inl.cuh | 941 ++++++ .../detail/ivf_pq_compute_similarity.cuh | 25 + .../detail/ivf_pq_dummy_block_sort.cuh | 39 + .../cuvs/neighbors/detail/ivf_pq_fp_8bit.cuh | 128 + .../cuvs/neighbors/detail/ivf_pq_search.cuh | 860 ++++++ .../neighbors/detail/ivf_pq_serialize.cuh | 191 ++ .../cuvs/neighbors/detail/knn_brute_force.cuh | 550 ++++ .../detail/knn_brute_force_batch_k_query.cuh | 98 + .../cuvs/neighbors/detail/knn_merge_parts.cuh | 172 ++ .../cuvs/neighbors/detail/nn_descent.cuh | 1454 ++++++++++ cpp/include/cuvs/neighbors/detail/refine.cuh | 19 + .../cuvs/neighbors/detail/refine_common.hpp | 57 + .../cuvs/neighbors/detail/refine_device.cuh | 109 + .../cuvs/neighbors/detail/refine_host-ext.hpp | 55 + .../cuvs/neighbors/detail/refine_host-inl.hpp | 139 + .../cuvs/neighbors/detail/refine_host.hpp | 24 + .../neighbors/detail/selection_faiss-ext.cuh | 67 + .../neighbors/detail/selection_faiss-inl.cuh | 163 ++ .../cuvs/neighbors/detail/selection_faiss.cuh | 24 + .../detail/selection_faiss_helpers.cuh | 31 + .../cuvs/neighbors/epsilon_neighborhood.cuh | 123 + cpp/include/cuvs/neighbors/ivf_flat-ext.cuh | 206 ++ cpp/include/cuvs/neighbors/ivf_flat-inl.cuh | 602 ++++ cpp/include/cuvs/neighbors/ivf_flat.cuh | 24 + .../cuvs/neighbors/ivf_flat_codepacker.hpp | 90 + .../cuvs/neighbors/ivf_flat_helpers.cuh | 145 + .../cuvs/neighbors/ivf_flat_serialize.cuh | 154 + cpp/include/cuvs/neighbors/ivf_flat_types.hpp | 400 +++ cpp/include/cuvs/neighbors/ivf_list.hpp | 194 ++ cpp/include/cuvs/neighbors/ivf_list_types.hpp | 79 + cpp/include/cuvs/neighbors/ivf_pq-ext.cuh | 226 ++ cpp/include/cuvs/neighbors/ivf_pq-inl.cuh | 529 ++++ cpp/include/cuvs/neighbors/ivf_pq.cuh | 24 + 
cpp/include/cuvs/neighbors/ivf_pq_helpers.cuh | 793 +++++ .../cuvs/neighbors/ivf_pq_serialize.cuh | 146 + cpp/include/cuvs/neighbors/ivf_pq_types.hpp | 577 ++++ .../cuvs/neighbors/neighbors_types.hpp | 63 + cpp/include/cuvs/neighbors/nn_descent.cuh | 181 ++ .../cuvs/neighbors/nn_descent_types.hpp | 147 + cpp/include/cuvs/neighbors/refine-ext.cuh | 78 + cpp/include/cuvs/neighbors/refine-inl.cuh | 104 + cpp/include/cuvs/neighbors/refine.cuh | 24 + cpp/include/cuvs/neighbors/sample_filter.cuh | 49 + .../cuvs/neighbors/sample_filter_types.hpp | 175 ++ .../cuvs/neighbors/specializations.cuh | 22 + .../neighbors/specializations/ball_cover.cuh | 22 + .../neighbors/specializations/brute_force.cuh | 22 + .../detail/ball_cover_lowdim.hpp | 85 + .../detail/ivf_pq_compute_similarity.cuh | 22 + .../specializations/fused_l2_knn.cuh | 22 + .../neighbors/specializations/ivf_flat.cuh | 22 + .../cuvs/neighbors/specializations/ivf_pq.cuh | 22 + .../cuvs/neighbors/specializations/refine.cuh | 22 + cpp/include/cuvs/spatial/knn/ann.cuh | 83 + cpp/include/cuvs/spatial/knn/ann_common.h | 103 + cpp/include/cuvs/spatial/knn/ann_types.hpp | 45 + cpp/include/cuvs/spatial/knn/ball_cover.cuh | 70 + .../cuvs/spatial/knn/ball_cover_types.hpp | 37 + cpp/include/cuvs/spatial/knn/common.hpp | 23 + .../cuvs/spatial/knn/detail/ann_quantized.cuh | 147 + .../cuvs/spatial/knn/detail/ann_utils.cuh | 576 ++++ .../cuvs/spatial/knn/detail/ball_cover.cuh | 545 ++++ .../spatial/knn/detail/ball_cover/common.cuh | 73 + .../knn/detail/ball_cover/registers-ext.cuh | 129 + .../knn/detail/ball_cover/registers-inl.cuh | 794 +++++ .../knn/detail/ball_cover/registers.cuh | 24 + .../knn/detail/ball_cover/registers_types.cuh | 66 + .../knn/detail/epsilon_neighborhood.cuh | 241 ++ .../spatial/knn/detail/fused_l2_knn-ext.cuh | 74 + .../spatial/knn/detail/fused_l2_knn-inl.cuh | 1062 +++++++ .../cuvs/spatial/knn/detail/fused_l2_knn.cuh | 24 + .../spatial/knn/detail/haversine_distance.cuh | 143 + 
.../cuvs/spatial/knn/detail/processing.cuh | 189 ++ .../cuvs/spatial/knn/detail/processing.hpp | 45 + .../cuvs/spatial/knn/epsilon_neighborhood.cuh | 38 + cpp/include/cuvs/spatial/knn/ivf_flat.cuh | 39 + .../cuvs/spatial/knn/ivf_flat_types.hpp | 40 + cpp/include/cuvs/spatial/knn/ivf_pq.cuh | 39 + cpp/include/cuvs/spatial/knn/ivf_pq_types.hpp | 40 + cpp/include/cuvs/spatial/knn/knn.cuh | 230 ++ .../cuvs/spatial/knn/specializations.cuh | 22 + .../cuvs/spatial/knn/specializations/knn.cuh | 22 + cpp/include/cuvs/spectral/cluster_solvers.cuh | 99 + .../spectral/cluster_solvers_deprecated.cuh | 89 + cpp/include/cuvs/spectral/detail/lapack.hpp | 574 ++++ .../cuvs/spectral/detail/matrix_wrappers.hpp | 465 +++ .../detail/modularity_maximization.hpp | 171 ++ .../cuvs/spectral/detail/partition.hpp | 185 ++ .../cuvs/spectral/detail/spectral_util.cuh | 257 ++ cpp/include/cuvs/spectral/detail/warn_dbg.hpp | 37 + cpp/include/cuvs/spectral/eigen_solvers.cuh | 107 + cpp/include/cuvs/spectral/matrix_wrappers.hpp | 49 + .../cuvs/spectral/modularity_maximization.cuh | 86 + cpp/include/cuvs/spectral/partition.cuh | 95 + cpp/include/cuvs/spectral/specializations.cuh | 22 + cpp/include/cuvs/stats/accuracy.cuh | 78 + .../cuvs/stats/adjusted_rand_index.cuh | 89 + cpp/include/cuvs/stats/completeness_score.cuh | 91 + cpp/include/cuvs/stats/contingency_matrix.cuh | 217 ++ cpp/include/cuvs/stats/cov.cuh | 122 + .../cuvs/stats/detail/adjusted_rand_index.cuh | 201 ++ .../detail/batched/information_criterion.cuh | 74 + .../stats/detail/batched/silhouette_score.cuh | 278 ++ .../cuvs/stats/detail/contingencyMatrix.cuh | 316 ++ cpp/include/cuvs/stats/detail/cov.cuh | 96 + cpp/include/cuvs/stats/detail/dispersion.cuh | 138 + cpp/include/cuvs/stats/detail/entropy.cuh | 154 + cpp/include/cuvs/stats/detail/histogram.cuh | 496 ++++ .../cuvs/stats/detail/homogeneity_score.cuh | 71 + .../cuvs/stats/detail/kl_divergence.cuh | 84 + cpp/include/cuvs/stats/detail/mean.cuh | 87 + 
cpp/include/cuvs/stats/detail/mean_center.cuh | 85 + cpp/include/cuvs/stats/detail/meanvar.cuh | 230 ++ cpp/include/cuvs/stats/detail/minmax.cuh | 238 ++ .../cuvs/stats/detail/mutual_info_score.cuh | 179 ++ .../cuvs/stats/detail/neighborhood_recall.cuh | 115 + cpp/include/cuvs/stats/detail/rand_index.cuh | 167 ++ cpp/include/cuvs/stats/detail/scores.cuh | 217 ++ .../cuvs/stats/detail/silhouette_score.cuh | 320 +++ cpp/include/cuvs/stats/detail/stddev.cuh | 182 ++ cpp/include/cuvs/stats/detail/sum.cuh | 84 + .../stats/detail/trustworthiness_score.cuh | 220 ++ cpp/include/cuvs/stats/detail/v_measure.cuh | 64 + .../cuvs/stats/detail/weighted_mean.cuh | 75 + cpp/include/cuvs/stats/dispersion.cuh | 133 + cpp/include/cuvs/stats/entropy.cuh | 86 + cpp/include/cuvs/stats/histogram.cuh | 121 + cpp/include/cuvs/stats/homogeneity_score.cuh | 94 + .../cuvs/stats/information_criterion.cuh | 118 + cpp/include/cuvs/stats/kl_divergence.cuh | 82 + cpp/include/cuvs/stats/mean.cuh | 99 + cpp/include/cuvs/stats/mean_center.cuh | 166 ++ cpp/include/cuvs/stats/meanvar.cuh | 112 + cpp/include/cuvs/stats/minmax.cuh | 144 + cpp/include/cuvs/stats/mutual_info_score.cuh | 92 + .../cuvs/stats/neighborhood_recall.cuh | 194 ++ cpp/include/cuvs/stats/r2_score.cuh | 93 + cpp/include/cuvs/stats/rand_index.cuh | 78 + cpp/include/cuvs/stats/regression_metrics.cuh | 107 + cpp/include/cuvs/stats/silhouette_score.cuh | 226 ++ cpp/include/cuvs/stats/specializations.cuh | 22 + cpp/include/cuvs/stats/stats_types.hpp | 76 + cpp/include/cuvs/stats/stddev.cuh | 188 ++ cpp/include/cuvs/stats/sum.cuh | 91 + .../cuvs/stats/trustworthiness_score.cuh | 101 + cpp/include/cuvs/stats/v_measure.cuh | 98 + cpp/include/cuvs/stats/weighted_mean.cuh | 192 ++ cpp/include/cuvs_runtime/cluster/kmeans.hpp | 97 + .../cuvs_runtime/distance/fused_l2_nn.hpp | 65 + .../distance/pairwise_distance.hpp | 50 + cpp/include/cuvs_runtime/matrix/select_k.hpp | 32 + .../cuvs_runtime/neighbors/brute_force.hpp | 38 + 
cpp/include/cuvs_runtime/neighbors/cagra.hpp | 93 + .../cuvs_runtime/neighbors/ivf_flat.hpp | 83 + cpp/include/cuvs_runtime/neighbors/ivf_pq.hpp | 96 + cpp/include/cuvs_runtime/neighbors/refine.hpp | 48 + cpp/internal/CMakeLists.txt | 21 + .../cuvs_internal/matrix/select_k.cuh | 175 ++ .../cuvs_internal/neighbors/naive_knn.cuh | 128 + .../cuvs_internal/neighbors/refine_helper.cuh | 158 + .../__clang_cuda_additional_intrinsics.h | 391 +++ cpp/scripts/analyze_nvcc_log.py | 134 + cpp/scripts/gitutils.py | 286 ++ .../select_k/algorithm_selection.ipynb | 445 +++ .../select_k/generate_heuristic.ipynb | 559 ++++ .../heuristics/select_k/generate_plots.ipynb | 352 +++ .../heuristics/select_k/select_k_dataset.py | 114 + cpp/scripts/include_checker.py | 93 + cpp/scripts/run-clang-compile.py | 347 +++ cpp/scripts/run-clang-tidy.py | 443 +++ cpp/scripts/run-cmake-format.sh | 83 + cpp/src/cuvs_runtime/cluster/cluster_cost.cuh | 87 + .../cluster/cluster_cost_double.cu | 33 + .../cluster/cluster_cost_float.cu | 33 + .../cuvs_runtime/cluster/kmeans_fit_double.cu | 33 + .../cuvs_runtime/cluster/kmeans_fit_float.cu | 33 + .../cluster/kmeans_init_plus_plus_double.cu | 31 + .../cluster/kmeans_init_plus_plus_float.cu | 31 + .../cuvs_runtime/cluster/update_centroids.cuh | 72 + .../cluster/update_centroids_double.cu | 46 + .../cluster/update_centroids_float.cu | 46 + .../cuvs_runtime/distance/fused_l2_min_arg.cu | 105 + .../distance/pairwise_distance.cu | 52 + .../matrix/select_k_float_int64_t.cu | 36 + .../brute_force_knn_int64_t_float.cu | 47 + cpp/src/cuvs_runtime/neighbors/cagra_build.cu | 81 + .../cuvs_runtime/neighbors/cagra_search.cu | 39 + .../cuvs_runtime/neighbors/cagra_serialize.cu | 67 + .../cuvs_runtime/neighbors/ivf_flat_build.cu | 62 + .../cuvs_runtime/neighbors/ivf_flat_search.cu | 40 + .../neighbors/ivf_flat_serialize.cu | 65 + cpp/src/cuvs_runtime/neighbors/ivfpq_build.cu | 59 + .../neighbors/ivfpq_deserialize.cu | 31 + .../neighbors/ivfpq_search_float_int64_t.cu | 38 + 
.../neighbors/ivfpq_search_int8_t_int64_t.cu | 38 + .../neighbors/ivfpq_search_uint8_t_int64_t.cu | 38 + .../cuvs_runtime/neighbors/ivfpq_serialize.cu | 31 + .../neighbors/refine_d_int64_t_float.cu | 33 + .../neighbors/refine_d_int64_t_int8_t.cu | 33 + .../neighbors/refine_d_int64_t_uint8_t.cu | 33 + .../neighbors/refine_h_int64_t_float.cu | 34 + .../neighbors/refine_h_int64_t_int8_t.cu | 33 + .../neighbors/refine_h_int64_t_uint8_t.cu | 33 + cpp/src/cuvs_runtime/random/common.cuh | 41 + ...rmat_rectangular_generator_int64_double.cu | 23 + .../rmat_rectangular_generator_int64_float.cu | 23 + .../rmat_rectangular_generator_int_double.cu | 23 + .../rmat_rectangular_generator_int_float.cu | 23 + .../pairwise_matrix/dispatch_00_generate.py | 194 ++ ...patch_canberra_double_double_double_int.cu | 55 + ...dispatch_canberra_float_float_float_int.cu | 50 + ...ch_correlation_double_double_double_int.cu | 55 + ...patch_correlation_float_float_float_int.cu | 55 + ...ispatch_cosine_double_double_double_int.cu | 51 + .../dispatch_cosine_float_float_float_int.cu | 51 + ...ing_unexpanded_double_double_double_int.cu | 50 + ...amming_unexpanded_float_float_float_int.cu | 50 + ...inger_expanded_double_double_double_int.cu | 55 + ...ellinger_expanded_float_float_float_int.cu | 50 + ...jensen_shannon_double_double_double_int.cu | 55 + ...ch_jensen_shannon_float_float_float_int.cu | 55 + ..._kl_divergence_double_double_double_int.cu | 50 + ...tch_kl_divergence_float_float_float_int.cu | 50 + .../dispatch_l1_double_double_double_int.cu | 50 + .../dispatch_l1_float_float_float_int.cu | 50 + ...ch_l2_expanded_double_double_double_int.cu | 51 + ...patch_l2_expanded_float_float_float_int.cu | 51 + ..._l2_unexpanded_double_double_double_int.cu | 55 + ...tch_l2_unexpanded_float_float_float_int.cu | 50 + ...dispatch_l_inf_double_double_double_int.cu | 50 + .../dispatch_l_inf_float_float_float_int.cu | 50 + ..._lp_unexpanded_double_double_double_int.cu | 55 + 
...tch_lp_unexpanded_float_float_float_int.cu | 50 + .../detail/pairwise_matrix/dispatch_rbf.cu | 64 + ...tch_russel_rao_double_double_double_int.cu | 55 + ...spatch_russel_rao_float_float_float_int.cu | 50 + cpp/src/distance/distance.cu | 934 ++++++ cpp/src/distance/fused_l2_nn.cu | 54 + .../matrix/detail/select_k_double_int64_t.cu | 34 + .../matrix/detail/select_k_double_uint32_t.cu | 35 + cpp/src/matrix/detail/select_k_float_int32.cu | 34 + .../matrix/detail/select_k_float_int64_t.cu | 34 + .../matrix/detail/select_k_float_uint32_t.cu | 34 + .../matrix/detail/select_k_half_int64_t.cu | 34 + .../matrix/detail/select_k_half_uint32_t.cu | 34 + cpp/src/neighbors/ball_cover.cu | 66 + cpp/src/neighbors/brute_force_00_generate.py | 106 + .../brute_force_fused_l2_knn_float_int64_t.cu | 45 + .../neighbors/brute_force_knn_index_float.cu | 39 + .../brute_force_knn_int64_t_float_int64_t.cu | 47 + .../brute_force_knn_int64_t_float_uint32_t.cu | 47 + .../brute_force_knn_int_float_int.cu | 47 + ...brute_force_knn_uint32_t_float_uint32_t.cu | 47 + .../cagra/search_multi_cta_00_generate.py | 108 + ...arch_multi_cta_float_uint32_dim1024_t32.cu | 66 + ...search_multi_cta_float_uint32_dim128_t8.cu | 66 + ...earch_multi_cta_float_uint32_dim256_t16.cu | 66 + ...earch_multi_cta_float_uint32_dim512_t32.cu | 66 + ...arch_multi_cta_float_uint64_dim1024_t32.cu | 66 + ...search_multi_cta_float_uint64_dim128_t8.cu | 66 + ...earch_multi_cta_float_uint64_dim256_t16.cu | 66 + ...earch_multi_cta_float_uint64_dim512_t32.cu | 66 + ...earch_multi_cta_int8_uint32_dim1024_t32.cu | 66 + .../search_multi_cta_int8_uint32_dim128_t8.cu | 66 + ...search_multi_cta_int8_uint32_dim256_t16.cu | 66 + ...search_multi_cta_int8_uint32_dim512_t32.cu | 66 + ...arch_multi_cta_uint8_uint32_dim1024_t32.cu | 66 + ...search_multi_cta_uint8_uint32_dim128_t8.cu | 66 + ...earch_multi_cta_uint8_uint32_dim256_t16.cu | 66 + ...earch_multi_cta_uint8_uint32_dim512_t32.cu | 66 + .../cagra/search_single_cta_00_generate.py | 113 + 
...rch_single_cta_float_uint32_dim1024_t32.cu | 67 + ...earch_single_cta_float_uint32_dim128_t8.cu | 67 + ...arch_single_cta_float_uint32_dim256_t16.cu | 67 + ...arch_single_cta_float_uint32_dim512_t32.cu | 67 + ...rch_single_cta_float_uint64_dim1024_t32.cu | 67 + ...earch_single_cta_float_uint64_dim128_t8.cu | 67 + ...arch_single_cta_float_uint64_dim256_t16.cu | 67 + ...arch_single_cta_float_uint64_dim512_t32.cu | 67 + ...arch_single_cta_int8_uint32_dim1024_t32.cu | 67 + ...search_single_cta_int8_uint32_dim128_t8.cu | 67 + ...earch_single_cta_int8_uint32_dim256_t16.cu | 67 + ...earch_single_cta_int8_uint32_dim512_t32.cu | 67 + ...rch_single_cta_uint8_uint32_dim1024_t32.cu | 67 + ...earch_single_cta_uint8_uint32_dim128_t8.cu | 67 + ...arch_single_cta_uint8_uint32_dim256_t16.cu | 67 + ...arch_single_cta_uint8_uint32_dim512_t32.cu | 67 + ...at_interleaved_scan_float_float_int64_t.cu | 42 + ...interleaved_scan_int8_t_int32_t_int64_t.cu | 42 + ...terleaved_scan_uint8_t_uint32_t_int64_t.cu | 42 + cpp/src/neighbors/detail/ivf_flat_search.cu | 40 + .../ivf_pq_compute_similarity_00_generate.py | 108 + .../ivf_pq_compute_similarity_float_float.cu | 81 + ...f_pq_compute_similarity_float_fp8_false.cu | 81 + ...vf_pq_compute_similarity_float_fp8_true.cu | 81 + .../ivf_pq_compute_similarity_float_half.cu | 81 + ...vf_pq_compute_similarity_half_fp8_false.cu | 81 + ...ivf_pq_compute_similarity_half_fp8_true.cu | 81 + .../ivf_pq_compute_similarity_half_half.cu | 81 + .../detail/refine_host_float_float.cpp | 29 + .../detail/refine_host_int8_t_float.cpp | 29 + .../detail/refine_host_uint8_t_float.cpp | 30 + .../detail/selection_faiss_00_generate.py | 79 + .../detail/selection_faiss_int32_t_float.cu | 44 + .../detail/selection_faiss_int64_t_double.cu | 44 + .../detail/selection_faiss_int64_t_half.cu | 44 + .../detail/selection_faiss_int_double.cu | 44 + .../detail/selection_faiss_long_float.cu | 44 + .../detail/selection_faiss_size_t_double.cu | 44 + 
.../detail/selection_faiss_size_t_float.cu | 44 + .../detail/selection_faiss_uint32_t_double.cu | 44 + .../detail/selection_faiss_uint32_t_float.cu | 44 + .../detail/selection_faiss_uint32_t_half.cu | 44 + cpp/src/neighbors/ivf_flat_00_generate.py | 148 + .../neighbors/ivf_flat_build_float_int64_t.cu | 50 + .../ivf_flat_build_int8_t_int64_t.cu | 50 + .../ivf_flat_build_uint8_t_int64_t.cu | 50 + .../ivf_flat_extend_float_int64_t.cu | 58 + .../ivf_flat_extend_int8_t_int64_t.cu | 58 + .../ivf_flat_extend_uint8_t_int64_t.cu | 58 + .../ivf_flat_search_float_int64_t.cu | 49 + .../ivf_flat_search_int8_t_int64_t.cu | 49 + .../ivf_flat_search_uint8_t_int64_t.cu | 49 + .../neighbors/ivfpq_build_float_int64_t.cu | 36 + .../neighbors/ivfpq_build_int8_t_int64_t.cu | 36 + .../neighbors/ivfpq_build_uint8_t_int64_t.cu | 36 + .../neighbors/ivfpq_extend_float_int64_t.cu | 50 + .../neighbors/ivfpq_extend_int8_t_int64_t.cu | 50 + .../neighbors/ivfpq_extend_uint8_t_int64_t.cu | 50 + .../neighbors/ivfpq_search_float_int64_t.cu | 42 + .../neighbors/ivfpq_search_int8_t_int64_t.cu | 42 + .../neighbors/ivfpq_search_uint8_t_int64_t.cu | 42 + cpp/src/neighbors/refine_00_generate.py | 78 + cpp/src/neighbors/refine_float_float.cu | 50 + cpp/src/neighbors/refine_int8_t_float.cu | 50 + cpp/src/neighbors/refine_uint8_t_float.cu | 50 + .../knn/detail/ball_cover/registers.cu | 60 + .../ball_cover/registers_00_generate.py | 112 + .../ball_cover/registers_pass_one_2d_dist.cu | 48 + .../registers_pass_one_2d_euclidean.cu | 48 + .../registers_pass_one_2d_haversine.cu | 48 + .../ball_cover/registers_pass_one_3d_dist.cu | 48 + .../registers_pass_one_3d_euclidean.cu | 48 + .../registers_pass_one_3d_haversine.cu | 48 + .../ball_cover/registers_pass_two_2d_dist.cu | 48 + .../registers_pass_two_2d_euclidean.cu | 48 + .../registers_pass_two_2d_haversine.cu | 48 + .../ball_cover/registers_pass_two_3d_dist.cu | 48 + .../registers_pass_two_3d_euclidean.cu | 48 + .../registers_pass_two_3d_haversine.cu | 48 + 
.../knn/detail/fused_l2_knn_int32_t_float.cu | 42 + .../knn/detail/fused_l2_knn_int64_t_float.cu | 42 + .../knn/detail/fused_l2_knn_uint32_t_float.cu | 43 + cpp/template/CMakeLists.txt | 41 + cpp/template/README.md | 18 + cpp/template/build.sh | 41 + .../cmake/thirdparty/fetch_rapids.cmake | 21 + cpp/template/cmake/thirdparty/get_cuvs.cmake | 63 + cpp/template/src/cagra_example.cu | 91 + cpp/template/src/common.cuh | 95 + cpp/template/src/ivf_flat_example.cu | 160 ++ cpp/test/CMakeLists.txt | 472 +++ cpp/test/cluster/cluster_solvers.cu | 103 + .../cluster/cluster_solvers_deprecated.cu | 58 + cpp/test/cluster/kmeans.cu | 359 +++ cpp/test/cluster/kmeans_balanced.cu | 236 ++ cpp/test/cluster/kmeans_find_k.cu | 140 + cpp/test/cluster/linkage.cu | 675 +++++ cpp/test/distance/dist_adj.cu | 194 ++ cpp/test/distance/dist_adj.cuh | 71 + .../distance/dist_adj_distance_instance.cu | 63 + cpp/test/distance/dist_adj_threshold.cuh | 36 + cpp/test/distance/dist_canberra.cu | 70 + cpp/test/distance/dist_correlation.cu | 94 + cpp/test/distance/dist_cos.cu | 110 + cpp/test/distance/dist_hamming.cu | 71 + cpp/test/distance/dist_hellinger.cu | 71 + cpp/test/distance/dist_inner_product.cu | 74 + cpp/test/distance/dist_jensen_shannon.cu | 71 + cpp/test/distance/dist_kl_divergence.cu | 71 + cpp/test/distance/dist_l1.cu | 70 + cpp/test/distance/dist_l2_exp.cu | 113 + cpp/test/distance/dist_l2_sqrt_exp.cu | 74 + cpp/test/distance/dist_l2_unexp.cu | 71 + cpp/test/distance/dist_l_inf.cu | 70 + cpp/test/distance/dist_lp_unexp.cu | 71 + cpp/test/distance/dist_russell_rao.cu | 71 + cpp/test/distance/distance_base.cuh | 673 +++++ cpp/test/distance/fused_l2_nn.cu | 436 +++ cpp/test/distance/gram.cu | 170 ++ cpp/test/distance/gram_base.cuh | 88 + cpp/test/distance/masked_nn.cu | 435 +++ .../distance/masked_nn_compress_to_bits.cu | 217 ++ cpp/test/ext_headers/00_generate.py | 79 + cpp/test/ext_headers/raft_core_logger.cpp | 27 + ...istance_detail_pairwise_matrix_dispatch.cu | 27 + 
.../ext_headers/raft_distance_distance.cu | 27 + .../ext_headers/raft_distance_fused_l2_nn.cu | 27 + .../raft_linalg_detail_coalesced_reduction.cu | 27 + .../raft_matrix_detail_select_k.cu | 27 + .../ext_headers/raft_neighbors_ball_cover.cu | 27 + .../ext_headers/raft_neighbors_brute_force.cu | 27 + ...ghbors_detail_ivf_flat_interleaved_scan.cu | 27 + .../raft_neighbors_detail_ivf_flat_search.cu | 27 + ...ghbors_detail_ivf_pq_compute_similarity.cu | 27 + .../raft_neighbors_detail_selection_faiss.cu | 27 + .../ext_headers/raft_neighbors_ivf_flat.cu | 27 + cpp/test/ext_headers/raft_neighbors_ivf_pq.cu | 27 + cpp/test/ext_headers/raft_neighbors_refine.cu | 27 + ...spatial_knn_detail_ball_cover_registers.cu | 27 + .../raft_spatial_knn_detail_fused_l2_knn.cu | 27 + .../ext_headers/raft_util_memory_pool.cpp | 27 + cpp/test/neighbors/ann_cagra.cuh | 773 +++++ .../ann_cagra/search_kernel_uint64_t.cuh | 107 + .../neighbors/ann_cagra/test_float_int64_t.cu | 29 + .../ann_cagra/test_float_uint32_t.cu | 40 + .../ann_cagra/test_int8_t_uint32_t.cu | 38 + .../ann_cagra/test_uint8_t_uint32_t.cu | 40 + cpp/test/neighbors/ann_ivf_flat.cuh | 615 ++++ .../ann_ivf_flat/test_filter_float_int64_t.cu | 29 + .../ann_ivf_flat/test_float_int64_t.cu | 32 + .../ann_ivf_flat/test_int8_t_int64_t.cu | 28 + .../ann_ivf_flat/test_uint8_t_int64_t.cu | 28 + cpp/test/neighbors/ann_ivf_pq.cuh | 1095 +++++++ .../ann_ivf_pq/test_filter_float_int64_t.cu | 26 + .../ann_ivf_pq/test_filter_int8_t_int64_t.cu | 27 + .../ann_ivf_pq/test_float_int64_t.cu | 27 + .../ann_ivf_pq/test_float_uint32_t.cu | 37 + .../ann_ivf_pq/test_int8_t_int64_t.cu | 27 + .../ann_ivf_pq/test_uint8_t_int64_t.cu | 27 + cpp/test/neighbors/ann_nn_descent.cuh | 156 + .../ann_nn_descent/test_float_uint32_t.cu | 28 + .../ann_nn_descent/test_int8_t_uint32_t.cu | 28 + .../ann_nn_descent/test_uint8_t_uint32_t.cu | 28 + cpp/test/neighbors/ann_utils.cuh | 294 ++ cpp/test/neighbors/ball_cover.cu | 372 +++ cpp/test/neighbors/epsilon_neighborhood.cu 
| 121 + cpp/test/neighbors/fused_l2_knn.cu | 175 ++ cpp/test/neighbors/haversine.cu | 133 + cpp/test/neighbors/knn.cu | 197 ++ cpp/test/neighbors/knn_utils.cuh | 94 + cpp/test/neighbors/refine.cu | 129 + cpp/test/neighbors/selection.cu | 499 ++++ cpp/test/neighbors/spatial_data.h | 38 + cpp/test/neighbors/tiled_knn.cu | 354 +++ cpp/test/sparse/dist_coo_spmv.cu | 697 +++++ cpp/test/sparse/distance.cu | 853 ++++++ cpp/test/sparse/gram.cu | 328 +++ cpp/test/sparse/neighbors/brute_force.cu | 179 ++ .../sparse/neighbors/cross_component_nn.cu | 1036 +++++++ cpp/test/sparse/neighbors/knn_graph.cu | 127 + cpp/test/sparse/spectral_matrix.cu | 83 + cpp/test/stats/accuracy.cu | 106 + cpp/test/stats/adjusted_rand_index.cu | 207 ++ cpp/test/stats/completeness_score.cu | 136 + cpp/test/stats/contingencyMatrix.cu | 167 ++ cpp/test/stats/cov.cu | 193 ++ cpp/test/stats/dispersion.cu | 132 + cpp/test/stats/entropy.cu | 123 + cpp/test/stats/histogram.cu | 318 ++ cpp/test/stats/homogeneity_score.cu | 134 + cpp/test/stats/information_criterion.cu | 151 + cpp/test/stats/kl_divergence.cu | 107 + cpp/test/stats/mean.cu | 149 + cpp/test/stats/meanvar.cu | 161 ++ cpp/test/stats/minmax.cu | 208 ++ cpp/test/stats/mutual_info_score.cu | 162 ++ cpp/test/stats/neighborhood_recall.cu | 178 ++ cpp/test/stats/r2_score.cu | 114 + cpp/test/stats/rand_index.cu | 129 + cpp/test/stats/regression_metrics.cu | 146 + cpp/test/stats/silhouette_score.cu | 227 ++ cpp/test/stats/stddev.cu | 196 ++ cpp/test/stats/sum.cu | 111 + cpp/test/stats/trustworthiness.cu | 352 +++ cpp/test/stats/v_measure.cu | 139 + cpp/test/stats/weighted_mean.cu | 339 +++ cpp/test/test.cpp | 25 + cpp/test/test_utils.cuh | 330 +++ cpp/test/test_utils.h | 115 + dependencies.yaml | 422 +++ docs/Makefile | 20 + docs/README.md | 14 + docs/make.bat | 36 + docs/source/_static/references.css | 23 + docs/source/ann_benchmarks_build.md | 48 + docs/source/ann_benchmarks_dataset.md | 63 + docs/source/ann_benchmarks_low_level.md | 219 ++ 
docs/source/ann_benchmarks_param_tuning.md | 178 ++ docs/source/build.md | 318 ++ docs/source/conf.py | 204 ++ docs/source/contributing.md | 93 + docs/source/cpp_api.rst | 14 + docs/source/cpp_api/cluster.rst | 18 + docs/source/cpp_api/cluster_kmeans.rst | 13 + .../cpp_api/cluster_kmeans_balanced.rst | 13 + docs/source/cpp_api/cluster_slhc.rst | 13 + docs/source/cpp_api/cluster_spectral.rst | 13 + docs/source/cpp_api/distance.rst | 28 + docs/source/cpp_api/distance_1nn.rst | 24 + docs/source/cpp_api/distance_pairwise.rst | 17 + docs/source/cpp_api/neighbors.rst | 19 + docs/source/cpp_api/neighbors_ball_cover.rst | 17 + docs/source/cpp_api/neighbors_brute_force.rst | 18 + docs/source/cpp_api/neighbors_cagra.rst | 31 + .../neighbors_epsilon_neighborhood.rst | 15 + docs/source/cpp_api/neighbors_ivf_flat.rst | 37 + docs/source/cpp_api/neighbors_ivf_pq.rst | 48 + docs/source/cpp_api/sparse.rst | 21 + docs/source/cpp_api/sparse_distance.rst | 7 + docs/source/cpp_api/sparse_neighbors.rst | 7 + docs/source/cpp_api/sparse_types.rst | 22 + docs/source/cpp_api/stats.rst | 19 + docs/source/cpp_api/stats_clustering.rst | 81 + docs/source/cpp_api/stats_neighborhood.rst | 30 + docs/source/cpp_api/stats_probability.rst | 56 + docs/source/developer_guide.md | 553 ++++ docs/source/index.rst | 77 + docs/source/pylibraft_api.rst | 15 + docs/source/python_api/cluster.rst | 20 + docs/source/python_api/common.rst | 38 + docs/source/python_api/distance.rst | 15 + docs/source/python_api/matrix.rst | 11 + docs/source/python_api/neighbors.rst | 83 + docs/source/quick_start.md | 183 ++ docs/source/raft_ann_benchmarks.md | 591 ++++ docs/source/sphinxext/github_link.py | 146 + docs/source/vector_search_tutorial.md | 343 +++ docs/source/wiki_all_dataset.md | 46 + fetch_rapids.cmake | 20 + img/raft-vector-search-batch-10.png | Bin 0 -> 61807 bytes img/rapids_arrow.png | Bin 0 -> 192477 bytes img/rapids_logo.png | Bin 0 -> 113880 bytes .../VectorSearch_QuestionRetrieval.ipynb | 628 ++++ 
notebooks/ivf_flat_example.ipynb | 674 +++++ notebooks/tutorial_ivf_pq.ipynb | 1365 +++++++++ notebooks/utils.py | 103 + pyproject.toml | 47 + python/cuda-ann-bench/LICENSE | 1 + python/cuda-ann-bench/pyproject.toml | 59 + .../src/cuda-ann-bench/__init__.py | 14 + .../cuda-ann-bench/constraints/__init__.py | 45 + .../cuda-ann-bench/data_export/__main__.py | 247 ++ .../generate_groundtruth/__main__.py | 245 ++ .../generate_groundtruth/utils.py | 103 + .../cuda-ann-bench/get_dataset/__main__.py | 115 + .../get_dataset/fbin_to_f16bin.py | 49 + .../get_dataset/hdf5_to_fbin.py | 90 + .../src/cuda-ann-bench/plot/__main__.py | 582 ++++ .../src/cuda-ann-bench/run/__main__.py | 599 ++++ .../src/cuda-ann-bench/run/algos.yaml | 42 + .../run/conf/algos/faiss_gpu_ivf_flat.yaml | 10 + .../run/conf/algos/faiss_gpu_ivf_pq.yaml | 12 + .../run/conf/algos/hnswlib.yaml | 10 + .../run/conf/algos/raft_cagra.yaml | 12 + .../run/conf/algos/raft_cagra_hnswlib.yaml | 11 + .../run/conf/algos/raft_ivf_flat.yaml | 9 + .../run/conf/algos/raft_ivf_pq.yaml | 17 + .../cuda-ann-bench/run/conf/bigann-100M.json | 192 ++ .../src/cuda-ann-bench/run/conf/datasets.yaml | 127 + .../cuda-ann-bench/run/conf/deep-100M.json | 458 +++ .../src/cuda-ann-bench/run/conf/deep-1B.json | 34 + .../run/conf/deep-image-96-inner.json | 1013 +++++++ .../run/conf/fashion-mnist-784-euclidean.json | 1352 +++++++++ .../run/conf/gist-960-euclidean.json | 1351 +++++++++ .../run/conf/glove-100-angular.json | 1351 +++++++++ .../run/conf/glove-100-inner.json | 1314 +++++++++ .../run/conf/glove-50-angular.json | 1351 +++++++++ .../run/conf/glove-50-inner.json | 1351 +++++++++ .../run/conf/lastfm-65-angular.json | 1351 +++++++++ .../run/conf/mnist-784-euclidean.json | 1352 +++++++++ .../run/conf/nytimes-256-angular.json | 1352 +++++++++ .../run/conf/nytimes-256-inner.json | 1352 +++++++++ .../run/conf/sift-128-euclidean.json | 498 ++++ .../cuda-ann-bench/run/conf/wiki_all_10M.json | 200 ++ 
.../cuda-ann-bench/run/conf/wiki_all_1M.json | 216 ++ .../cuda-ann-bench/run/conf/wiki_all_88M.json | 200 ++ .../split_groundtruth/__main__.py | 57 + .../split_groundtruth/split_groundtruth.pl | 45 + python/cuvs/.coveragerc | 3 + python/cuvs/CMakeLists.txt | 92 + python/cuvs/LICENSE | 1 + python/cuvs/cuvs/VERSION | 1 + python/cuvs/cuvs/__init__.py | 16 + python/cuvs/cuvs/_version.py | 22 + python/cuvs/cuvs/cluster/CMakeLists.txt | 24 + python/cuvs/cuvs/cluster/__init__.pxd | 14 + python/cuvs/cuvs/cluster/__init__.py | 30 + python/cuvs/cuvs/cluster/cpp/__init__.pxd | 0 python/cuvs/cuvs/cluster/cpp/__init__.py | 0 python/cuvs/cuvs/cluster/cpp/kmeans.pxd | 105 + python/cuvs/cuvs/cluster/cpp/kmeans_types.pxd | 43 + python/cuvs/cuvs/cluster/kmeans.pyx | 589 ++++ python/cuvs/cuvs/distance/CMakeLists.txt | 24 + python/cuvs/cuvs/distance/__init__.pxd | 14 + python/cuvs/cuvs/distance/__init__.py | 19 + python/cuvs/cuvs/distance/distance_type.pxd | 44 + python/cuvs/cuvs/distance/fused_l2_nn.pyx | 193 ++ .../cuvs/cuvs/distance/pairwise_distance.pyx | 242 ++ python/cuvs/cuvs/matrix/CMakeLists.txt | 24 + python/cuvs/cuvs/matrix/__init__.pxd | 14 + python/cuvs/cuvs/matrix/__init__.py | 18 + python/cuvs/cuvs/matrix/cpp/__init__.pxd | 0 python/cuvs/cuvs/matrix/cpp/__init__.py | 14 + python/cuvs/cuvs/matrix/cpp/select_k.pxd | 38 + python/cuvs/cuvs/matrix/select_k.pyx | 132 + python/cuvs/cuvs/neighbors/CMakeLists.txt | 28 + python/cuvs/cuvs/neighbors/__init__.pxd | 14 + python/cuvs/cuvs/neighbors/__init__.py | 20 + python/cuvs/cuvs/neighbors/brute_force.pyx | 177 ++ .../cuvs/cuvs/neighbors/cagra/CMakeLists.txt | 24 + python/cuvs/cuvs/neighbors/cagra/__init__.pxd | 0 python/cuvs/cuvs/neighbors/cagra/__init__.py | 26 + python/cuvs/cuvs/neighbors/cagra/cagra.pyx | 903 ++++++ .../cuvs/neighbors/cagra/cpp/__init__.pxd | 0 .../cuvs/cuvs/neighbors/cagra/cpp/__init__.py | 14 + .../cuvs/cuvs/neighbors/cagra/cpp/c_cagra.pxd | 223 ++ python/cuvs/cuvs/neighbors/common.pxd | 24 + 
python/cuvs/cuvs/neighbors/common.pyx | 63 + python/cuvs/cuvs/neighbors/cpp/__init__.pxd | 0 python/cuvs/cuvs/neighbors/cpp/__init__.py | 14 + .../cuvs/cuvs/neighbors/cpp/brute_force.pxd | 53 + .../cuvs/neighbors/ivf_flat/CMakeLists.txt | 24 + .../cuvs/cuvs/neighbors/ivf_flat/__init__.pxd | 0 .../cuvs/cuvs/neighbors/ivf_flat/__init__.py | 36 + .../cuvs/neighbors/ivf_flat/cpp/__init__.pxd | 0 .../cuvs/neighbors/ivf_flat/cpp/__init__.py | 14 + .../neighbors/ivf_flat/cpp/c_ivf_flat.pxd | 181 ++ .../cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx | 822 ++++++ .../cuvs/cuvs/neighbors/ivf_pq/CMakeLists.txt | 24 + .../cuvs/cuvs/neighbors/ivf_pq/__init__.pxd | 0 python/cuvs/cuvs/neighbors/ivf_pq/__init__.py | 36 + .../cuvs/neighbors/ivf_pq/cpp/__init__.pxd | 0 .../cuvs/neighbors/ivf_pq/cpp/__init__.py | 14 + .../cuvs/neighbors/ivf_pq/cpp/c_ivf_pq.pxd | 176 ++ python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd | 25 + python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx | 798 ++++++ python/cuvs/cuvs/neighbors/refine.pyx | 374 +++ python/cuvs/cuvs/test/test_brute_force.py | 110 + python/cuvs/cuvs/test/test_cagra.py | 311 ++ python/cuvs/cuvs/test/test_device_ndarray.py | 62 + python/cuvs/cuvs/test/test_distance.py | 79 + python/cuvs/cuvs/test/test_doctests.py | 128 + python/cuvs/cuvs/test/test_fused_l2_argmin.py | 52 + python/cuvs/cuvs/test/test_ivf_flat.py | 509 ++++ python/cuvs/cuvs/test/test_ivf_pq.py | 549 ++++ python/cuvs/cuvs/test/test_kmeans.py | 205 ++ python/cuvs/cuvs/test/test_refine.py | 232 ++ python/cuvs/cuvs/test/test_select_k.py | 53 + python/cuvs/pyproject.toml | 111 + python/cuvs/setup.cfg | 38 + python/cuvs/setup.py | 37 + setup.cfg | 55 + thirdparty/LICENSES/LICENSE.ann-benchmark | 21 + thirdparty/LICENSES/LICENSE.faiss | 21 + thirdparty/LICENSES/LICENSE.pytorch | 77 + thirdparty/LICENSES/LICENSE_Date_Nagi | 201 ++ 958 files changed, 151118 insertions(+) create mode 100644 .devcontainer/Dockerfile create mode 100644 .devcontainer/README.md create mode 100644 
.devcontainer/cuda11.8-conda/devcontainer.json create mode 100644 .devcontainer/cuda11.8-pip/devcontainer.json create mode 100644 .devcontainer/cuda12.0-conda/devcontainer.json create mode 100644 .devcontainer/cuda12.0-pip/devcontainer.json create mode 100644 .flake8 create mode 100755 .github/CODEOWNERS create mode 100755 .github/ISSUE_TEMPLATE/bug_report.md create mode 100755 .github/ISSUE_TEMPLATE/documentation-request.md create mode 100755 .github/ISSUE_TEMPLATE/feature_request.md create mode 100755 .github/ISSUE_TEMPLATE/submit-question.md create mode 100755 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/copy-pr-bot.yaml create mode 100644 .github/labeler.yml create mode 100644 .github/ops-bot.yaml create mode 100644 .github/workflows/build.yaml create mode 100644 .github/workflows/labeler.yml create mode 100644 .github/workflows/pr.yaml create mode 100644 .github/workflows/test.yaml create mode 100644 .gitignore create mode 100644 .pre-commit-config.yaml create mode 100644 CHANGELOG.md create mode 100755 LICENSE create mode 100644 VERSION create mode 100755 build.sh create mode 100755 ci/build_cpp.sh create mode 100755 ci/build_docs.sh create mode 100755 ci/build_python.sh create mode 100755 ci/build_wheel.sh create mode 100755 ci/build_wheel_cuvs.sh create mode 100755 ci/check_style.sh create mode 100755 ci/checks/black_lists.sh create mode 100644 ci/checks/copyright.py create mode 100755 ci/release/update-version.sh create mode 100755 ci/test_cpp.sh create mode 100755 ci/test_python.sh create mode 100755 ci/test_wheel_cuvs.sh create mode 100644 ci/wheel_smoke_test_cuvs.py create mode 100644 conda/environments/all_cuda-118_arch-aarch64.yaml create mode 100644 conda/environments/all_cuda-118_arch-x86_64.yaml create mode 100644 conda/environments/all_cuda-120_arch-aarch64.yaml create mode 100644 conda/environments/all_cuda-120_arch-x86_64.yaml create mode 100644 conda/environments/bench_ann_cuda-118_arch-aarch64.yaml create mode 100644 
conda/environments/bench_ann_cuda-118_arch-x86_64.yaml create mode 100644 conda/environments/bench_ann_cuda-120_arch-aarch64.yaml create mode 100644 conda/environments/bench_ann_cuda-120_arch-x86_64.yaml create mode 100644 conda/recipes/cuda-ann-bench-cpu/build.sh create mode 100644 conda/recipes/cuda-ann-bench-cpu/conda_build_config.yaml create mode 100644 conda/recipes/cuda-ann-bench-cpu/meta.yaml create mode 100644 conda/recipes/cuda-ann-bench/build.sh create mode 100644 conda/recipes/cuda-ann-bench/conda_build_config.yaml create mode 100644 conda/recipes/cuda-ann-bench/meta.yaml create mode 100644 conda/recipes/cuvs/build.sh create mode 100644 conda/recipes/cuvs/conda_build_config.yaml create mode 100644 conda/recipes/cuvs/meta.yaml create mode 100644 conda/recipes/libcuvs/build_libcuvs.sh create mode 100644 conda/recipes/libcuvs/build_libcuvs_static.sh create mode 100644 conda/recipes/libcuvs/build_libcuvs_template.sh create mode 100644 conda/recipes/libcuvs/build_libraft_tests.sh create mode 100644 conda/recipes/libcuvs/conda_build_config.yaml create mode 100644 conda/recipes/libcuvs/meta.yaml create mode 100644 cpp/.clang-format create mode 100644 cpp/.clang-tidy create mode 100644 cpp/.clangd create mode 100644 cpp/CMakeLists.txt create mode 100644 cpp/bench/ann/CMakeLists.txt create mode 100644 cpp/bench/ann/README.md create mode 100644 cpp/bench/ann/src/common/ann_types.hpp create mode 100644 cpp/bench/ann/src/common/benchmark.cpp create mode 100644 cpp/bench/ann/src/common/benchmark.hpp create mode 100644 cpp/bench/ann/src/common/conf.hpp create mode 100644 cpp/bench/ann/src/common/cuda_huge_page_resource.hpp create mode 100644 cpp/bench/ann/src/common/cuda_pinned_resource.hpp create mode 100644 cpp/bench/ann/src/common/cuda_stub.hpp create mode 100644 cpp/bench/ann/src/common/dataset.hpp create mode 100644 cpp/bench/ann/src/common/thread_pool.hpp create mode 100644 cpp/bench/ann/src/common/util.hpp create mode 100644 
cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp create mode 100644 cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h create mode 100644 cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu create mode 100644 cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h create mode 100644 cpp/bench/ann/src/ggnn/ggnn_benchmark.cu create mode 100644 cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh create mode 100644 cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp create mode 100644 cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h create mode 100644 cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h create mode 100644 cpp/bench/ann/src/raft/raft_ann_bench_utils.h create mode 100644 cpp/bench/ann/src/raft/raft_benchmark.cu create mode 100644 cpp/bench/ann/src/raft/raft_cagra.cu create mode 100644 cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu create mode 100644 cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h create mode 100644 cpp/bench/ann/src/raft/raft_cagra_wrapper.h create mode 100644 cpp/bench/ann/src/raft/raft_ivf_flat.cu create mode 100644 cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h create mode 100644 cpp/bench/ann/src/raft/raft_ivf_pq.cu create mode 100644 cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h create mode 100644 cpp/bench/ann/src/raft/raft_wrapper.h create mode 100644 cpp/bench/prims/CMakeLists.txt create mode 100644 cpp/bench/prims/cluster/kmeans.cu create mode 100644 cpp/bench/prims/cluster/kmeans_balanced.cu create mode 100644 cpp/bench/prims/common/benchmark.hpp create mode 100644 cpp/bench/prims/core/bitset.cu create mode 100644 cpp/bench/prims/core/copy.cu create mode 100644 cpp/bench/prims/distance/distance_common.cuh create mode 100644 cpp/bench/prims/distance/distance_cosine.cu create mode 100644 cpp/bench/prims/distance/distance_exp_l2.cu create mode 100644 cpp/bench/prims/distance/distance_l1.cu create mode 100644 cpp/bench/prims/distance/distance_unexp_l2.cu create mode 100644 cpp/bench/prims/distance/fused_l2_nn.cu create mode 100644 cpp/bench/prims/distance/kernels.cu create 
mode 100644 cpp/bench/prims/distance/masked_nn.cu create mode 100644 cpp/bench/prims/distance/tune_pairwise/bench.cu create mode 100644 cpp/bench/prims/distance/tune_pairwise/kernel.cu create mode 100644 cpp/bench/prims/distance/tune_pairwise/kernel.cuh create mode 100644 cpp/bench/prims/linalg/add.cu create mode 100644 cpp/bench/prims/linalg/map_then_reduce.cu create mode 100644 cpp/bench/prims/linalg/matrix_vector_op.cu create mode 100644 cpp/bench/prims/linalg/norm.cu create mode 100644 cpp/bench/prims/linalg/normalize.cu create mode 100644 cpp/bench/prims/linalg/reduce.cu create mode 100644 cpp/bench/prims/linalg/reduce_cols_by_key.cu create mode 100644 cpp/bench/prims/linalg/reduce_rows_by_key.cu create mode 100644 cpp/bench/prims/main.cpp create mode 100644 cpp/bench/prims/matrix/argmin.cu create mode 100644 cpp/bench/prims/matrix/gather.cu create mode 100644 cpp/bench/prims/matrix/main.cpp create mode 100644 cpp/bench/prims/matrix/select_k.cu create mode 100644 cpp/bench/prims/neighbors/cagra_bench.cuh create mode 100644 cpp/bench/prims/neighbors/knn.cuh create mode 100644 cpp/bench/prims/neighbors/knn/brute_force_float_int64_t.cu create mode 100644 cpp/bench/prims/neighbors/knn/brute_force_float_uint32_t.cu create mode 100644 cpp/bench/prims/neighbors/knn/cagra_float_uint32_t.cu create mode 100644 cpp/bench/prims/neighbors/knn/ivf_flat_filter_float_int64_t.cu create mode 100644 cpp/bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu create mode 100644 cpp/bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu create mode 100644 cpp/bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu create mode 100644 cpp/bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu create mode 100644 cpp/bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu create mode 100644 cpp/bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu create mode 100644 cpp/bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu create mode 100644 cpp/bench/prims/neighbors/refine.cuh create mode 100644 
cpp/bench/prims/neighbors/refine_float_int64_t.cu create mode 100644 cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu create mode 100644 cpp/bench/prims/random/make_blobs.cu create mode 100644 cpp/bench/prims/random/permute.cu create mode 100644 cpp/bench/prims/random/rng.cu create mode 100644 cpp/bench/prims/sparse/convert_csr.cu create mode 100644 cpp/cmake/config.json create mode 100644 cpp/cmake/modules/ConfigureCUDA.cmake create mode 100644 cpp/cmake/modules/FindAVX.cmake create mode 100644 cpp/cmake/patches/ggnn.patch create mode 100644 cpp/cmake/patches/hnswlib.patch create mode 100644 cpp/cmake/patches/nlohmann_json.patch create mode 100644 cpp/cmake/thirdparty/get_cutlass.cmake create mode 100644 cpp/cmake/thirdparty/get_faiss.cmake create mode 100644 cpp/cmake/thirdparty/get_fmt.cmake create mode 100644 cpp/cmake/thirdparty/get_ggnn.cmake create mode 100644 cpp/cmake/thirdparty/get_glog.cmake create mode 100644 cpp/cmake/thirdparty/get_gtest.cmake create mode 100644 cpp/cmake/thirdparty/get_hnswlib.cmake create mode 100644 cpp/cmake/thirdparty/get_nlohmann_json.cmake create mode 100644 cpp/cmake/thirdparty/get_rmm.cmake create mode 100644 cpp/cmake/thirdparty/get_spdlog.cmake create mode 100644 cpp/cmake/thirdparty/get_thrust.cmake create mode 100644 cpp/doxygen/Doxyfile create mode 100644 cpp/doxygen/header.html create mode 100644 cpp/doxygen/main_page.md create mode 100644 cpp/include/cuvs/cluster/detail/agglomerative.cuh create mode 100644 cpp/include/cuvs/cluster/detail/connectivities.cuh create mode 100644 cpp/include/cuvs/cluster/detail/kmeans.cuh create mode 100644 cpp/include/cuvs/cluster/detail/kmeans_auto_find_k.cuh create mode 100644 cpp/include/cuvs/cluster/detail/kmeans_balanced.cuh create mode 100644 cpp/include/cuvs/cluster/detail/kmeans_common.cuh create mode 100644 cpp/include/cuvs/cluster/detail/kmeans_deprecated.cuh create mode 100644 cpp/include/cuvs/cluster/detail/mst.cuh create mode 100644 
cpp/include/cuvs/cluster/detail/single_linkage.cuh create mode 100644 cpp/include/cuvs/cluster/kmeans.cuh create mode 100644 cpp/include/cuvs/cluster/kmeans_balanced.cuh create mode 100644 cpp/include/cuvs/cluster/kmeans_balanced_types.hpp create mode 100644 cpp/include/cuvs/cluster/kmeans_deprecated.cuh create mode 100644 cpp/include/cuvs/cluster/kmeans_types.hpp create mode 100644 cpp/include/cuvs/cluster/single_linkage.cuh create mode 100644 cpp/include/cuvs/cluster/single_linkage_types.hpp create mode 100644 cpp/include/cuvs/cluster/specializations.cuh create mode 100644 cpp/include/cuvs/distance/detail/compress_to_bits.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance_ops/all_ops.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance_ops/canberra.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance_ops/correlation.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance_ops/cosine.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance_ops/cutlass.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance_ops/hamming.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance_ops/hellinger.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance_ops/jensen_shannon.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance_ops/kl_divergence.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance_ops/l1.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance_ops/l2_exp.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance_ops/l2_unexp.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance_ops/l_inf.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance_ops/lp_unexp.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance_ops/russel_rao.cuh create mode 100644 cpp/include/cuvs/distance/detail/distance_ops/template.cuh create mode 100644 
cpp/include/cuvs/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h create mode 100644 cpp/include/cuvs/distance/detail/fused_distance_nn/cutlass_base.cuh create mode 100644 cpp/include/cuvs/distance/detail/fused_distance_nn/epilogue.cuh create mode 100644 cpp/include/cuvs/distance/detail/fused_distance_nn/epilogue_elementwise.cuh create mode 100644 cpp/include/cuvs/distance/detail/fused_distance_nn/gemm.h create mode 100644 cpp/include/cuvs/distance/detail/fused_distance_nn/persistent_gemm.h create mode 100644 cpp/include/cuvs/distance/detail/fused_distance_nn/predicated_tile_iterator_normvec_smem.h create mode 100644 cpp/include/cuvs/distance/detail/fused_distance_nn/predicated_tile_iterator_reduced_vec.h create mode 100644 cpp/include/cuvs/distance/detail/fused_l2_nn.cuh create mode 100644 cpp/include/cuvs/distance/detail/kernels/gram_matrix.cuh create mode 100644 cpp/include/cuvs/distance/detail/kernels/kernel_factory.cuh create mode 100644 cpp/include/cuvs/distance/detail/kernels/kernel_matrices.cuh create mode 100644 cpp/include/cuvs/distance/detail/kernels/rbf_fin_op.cuh create mode 100644 cpp/include/cuvs/distance/detail/masked_distance_base.cuh create mode 100644 cpp/include/cuvs/distance/detail/masked_nn.cuh create mode 100644 cpp/include/cuvs/distance/detail/pairwise_distance_base.cuh create mode 100644 cpp/include/cuvs/distance/detail/pairwise_distance_cutlass_base.cuh create mode 100644 cpp/include/cuvs/distance/detail/pairwise_distance_epilogue.h create mode 100644 cpp/include/cuvs/distance/detail/pairwise_distance_epilogue_elementwise.h create mode 100644 cpp/include/cuvs/distance/detail/pairwise_distance_gemm.h create mode 100644 cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch-ext.cuh create mode 100644 cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch-inl.cuh create mode 100644 cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch.cuh create mode 100644 
cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch_layout.cuh create mode 100644 cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch_sm60.cuh create mode 100644 cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch_sm80.cuh create mode 100644 cpp/include/cuvs/distance/detail/pairwise_matrix/kernel_sm60.cuh create mode 100644 cpp/include/cuvs/distance/detail/pairwise_matrix/params.cuh create mode 100644 cpp/include/cuvs/distance/detail/predicated_tile_iterator_normvec.h create mode 100644 cpp/include/cuvs/distance/distance-ext.cuh create mode 100644 cpp/include/cuvs/distance/distance-inl.cuh create mode 100644 cpp/include/cuvs/distance/distance.cuh create mode 100644 cpp/include/cuvs/distance/distance_types.hpp create mode 100644 cpp/include/cuvs/distance/fused_l2_nn-ext.cuh create mode 100644 cpp/include/cuvs/distance/fused_l2_nn-inl.cuh create mode 100644 cpp/include/cuvs/distance/fused_l2_nn.cuh create mode 100644 cpp/include/cuvs/distance/fused_l2_nn_helpers.cuh create mode 100644 cpp/include/cuvs/distance/kernels.cuh create mode 100644 cpp/include/cuvs/distance/masked_nn.cuh create mode 100644 cpp/include/cuvs/distance/specializations.cuh create mode 100644 cpp/include/cuvs/distance/specializations/distance.cuh create mode 100644 cpp/include/cuvs/distance/specializations/fused_l2_nn_min.cuh create mode 100644 cpp/include/cuvs/neighbors/ann_types.hpp create mode 100644 cpp/include/cuvs/neighbors/ball_cover-ext.cuh create mode 100644 cpp/include/cuvs/neighbors/ball_cover-inl.cuh create mode 100644 cpp/include/cuvs/neighbors/ball_cover.cuh create mode 100644 cpp/include/cuvs/neighbors/ball_cover_types.hpp create mode 100644 cpp/include/cuvs/neighbors/brute_force-ext.cuh create mode 100644 cpp/include/cuvs/neighbors/brute_force-inl.cuh create mode 100644 cpp/include/cuvs/neighbors/brute_force.cuh create mode 100644 cpp/include/cuvs/neighbors/brute_force_types.hpp create mode 100644 cpp/include/cuvs/neighbors/cagra.cuh create mode 100644 
cpp/include/cuvs/neighbors/cagra_serialize.cuh create mode 100644 cpp/include/cuvs/neighbors/cagra_types.hpp create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/bitonic.hpp create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/cagra_build.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/cagra_search.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/cagra_serialize.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/compute_distance.hpp create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/device_common.hpp create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/factory.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/fragment.hpp create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/graph_core.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/hashmap.hpp create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/search_multi_cta.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/search_multi_cta_kernel.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/search_multi_kernel.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/search_plan.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/search_single_cta.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/search_single_cta_kernel.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/topk_by_radix.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/topk_for_cagra/topk.h create mode 100644 cpp/include/cuvs/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh create mode 100644 
cpp/include/cuvs/neighbors/detail/cagra/utils.hpp create mode 100644 cpp/include/cuvs/neighbors/detail/div_utils.hpp create mode 100644 cpp/include/cuvs/neighbors/detail/faiss_select/Comparators.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/faiss_select/DistanceUtils.h create mode 100644 cpp/include/cuvs/neighbors/detail/faiss_select/MergeNetworkBlock.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/faiss_select/MergeNetworkUtils.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/faiss_select/MergeNetworkWarp.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/faiss_select/Select.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/faiss_select/StaticUtils.h create mode 100644 cpp/include/cuvs/neighbors/detail/faiss_select/key_value_block_select.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/ivf_flat_build.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/ivf_flat_interleaved_scan.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/ivf_flat_search-ext.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/ivf_flat_search-inl.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/ivf_flat_search.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/ivf_flat_serialize.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/ivf_pq_build.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/ivf_pq_codepacking.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/ivf_pq_compute_similarity-ext.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/ivf_pq_compute_similarity-inl.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/ivf_pq_compute_similarity.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/ivf_pq_dummy_block_sort.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/ivf_pq_fp_8bit.cuh create mode 
100644 cpp/include/cuvs/neighbors/detail/ivf_pq_search.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/ivf_pq_serialize.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/knn_brute_force.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/knn_brute_force_batch_k_query.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/knn_merge_parts.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/nn_descent.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/refine.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/refine_common.hpp create mode 100644 cpp/include/cuvs/neighbors/detail/refine_device.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/refine_host-ext.hpp create mode 100644 cpp/include/cuvs/neighbors/detail/refine_host-inl.hpp create mode 100644 cpp/include/cuvs/neighbors/detail/refine_host.hpp create mode 100644 cpp/include/cuvs/neighbors/detail/selection_faiss-ext.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/selection_faiss-inl.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/selection_faiss.cuh create mode 100644 cpp/include/cuvs/neighbors/detail/selection_faiss_helpers.cuh create mode 100644 cpp/include/cuvs/neighbors/epsilon_neighborhood.cuh create mode 100644 cpp/include/cuvs/neighbors/ivf_flat-ext.cuh create mode 100644 cpp/include/cuvs/neighbors/ivf_flat-inl.cuh create mode 100644 cpp/include/cuvs/neighbors/ivf_flat.cuh create mode 100644 cpp/include/cuvs/neighbors/ivf_flat_codepacker.hpp create mode 100644 cpp/include/cuvs/neighbors/ivf_flat_helpers.cuh create mode 100644 cpp/include/cuvs/neighbors/ivf_flat_serialize.cuh create mode 100644 cpp/include/cuvs/neighbors/ivf_flat_types.hpp create mode 100644 cpp/include/cuvs/neighbors/ivf_list.hpp create mode 100644 cpp/include/cuvs/neighbors/ivf_list_types.hpp create mode 100644 cpp/include/cuvs/neighbors/ivf_pq-ext.cuh create mode 100644 cpp/include/cuvs/neighbors/ivf_pq-inl.cuh create mode 100644 cpp/include/cuvs/neighbors/ivf_pq.cuh create 
mode 100644 cpp/include/cuvs/neighbors/ivf_pq_helpers.cuh create mode 100644 cpp/include/cuvs/neighbors/ivf_pq_serialize.cuh create mode 100644 cpp/include/cuvs/neighbors/ivf_pq_types.hpp create mode 100644 cpp/include/cuvs/neighbors/neighbors_types.hpp create mode 100644 cpp/include/cuvs/neighbors/nn_descent.cuh create mode 100644 cpp/include/cuvs/neighbors/nn_descent_types.hpp create mode 100644 cpp/include/cuvs/neighbors/refine-ext.cuh create mode 100644 cpp/include/cuvs/neighbors/refine-inl.cuh create mode 100644 cpp/include/cuvs/neighbors/refine.cuh create mode 100644 cpp/include/cuvs/neighbors/sample_filter.cuh create mode 100644 cpp/include/cuvs/neighbors/sample_filter_types.hpp create mode 100644 cpp/include/cuvs/neighbors/specializations.cuh create mode 100644 cpp/include/cuvs/neighbors/specializations/ball_cover.cuh create mode 100644 cpp/include/cuvs/neighbors/specializations/brute_force.cuh create mode 100644 cpp/include/cuvs/neighbors/specializations/detail/ball_cover_lowdim.hpp create mode 100644 cpp/include/cuvs/neighbors/specializations/detail/ivf_pq_compute_similarity.cuh create mode 100644 cpp/include/cuvs/neighbors/specializations/fused_l2_knn.cuh create mode 100644 cpp/include/cuvs/neighbors/specializations/ivf_flat.cuh create mode 100644 cpp/include/cuvs/neighbors/specializations/ivf_pq.cuh create mode 100644 cpp/include/cuvs/neighbors/specializations/refine.cuh create mode 100644 cpp/include/cuvs/spatial/knn/ann.cuh create mode 100644 cpp/include/cuvs/spatial/knn/ann_common.h create mode 100644 cpp/include/cuvs/spatial/knn/ann_types.hpp create mode 100644 cpp/include/cuvs/spatial/knn/ball_cover.cuh create mode 100644 cpp/include/cuvs/spatial/knn/ball_cover_types.hpp create mode 100644 cpp/include/cuvs/spatial/knn/common.hpp create mode 100644 cpp/include/cuvs/spatial/knn/detail/ann_quantized.cuh create mode 100644 cpp/include/cuvs/spatial/knn/detail/ann_utils.cuh create mode 100644 cpp/include/cuvs/spatial/knn/detail/ball_cover.cuh create mode 
100644 cpp/include/cuvs/spatial/knn/detail/ball_cover/common.cuh create mode 100644 cpp/include/cuvs/spatial/knn/detail/ball_cover/registers-ext.cuh create mode 100644 cpp/include/cuvs/spatial/knn/detail/ball_cover/registers-inl.cuh create mode 100644 cpp/include/cuvs/spatial/knn/detail/ball_cover/registers.cuh create mode 100644 cpp/include/cuvs/spatial/knn/detail/ball_cover/registers_types.cuh create mode 100644 cpp/include/cuvs/spatial/knn/detail/epsilon_neighborhood.cuh create mode 100644 cpp/include/cuvs/spatial/knn/detail/fused_l2_knn-ext.cuh create mode 100644 cpp/include/cuvs/spatial/knn/detail/fused_l2_knn-inl.cuh create mode 100644 cpp/include/cuvs/spatial/knn/detail/fused_l2_knn.cuh create mode 100644 cpp/include/cuvs/spatial/knn/detail/haversine_distance.cuh create mode 100644 cpp/include/cuvs/spatial/knn/detail/processing.cuh create mode 100644 cpp/include/cuvs/spatial/knn/detail/processing.hpp create mode 100644 cpp/include/cuvs/spatial/knn/epsilon_neighborhood.cuh create mode 100644 cpp/include/cuvs/spatial/knn/ivf_flat.cuh create mode 100644 cpp/include/cuvs/spatial/knn/ivf_flat_types.hpp create mode 100644 cpp/include/cuvs/spatial/knn/ivf_pq.cuh create mode 100644 cpp/include/cuvs/spatial/knn/ivf_pq_types.hpp create mode 100644 cpp/include/cuvs/spatial/knn/knn.cuh create mode 100644 cpp/include/cuvs/spatial/knn/specializations.cuh create mode 100644 cpp/include/cuvs/spatial/knn/specializations/knn.cuh create mode 100644 cpp/include/cuvs/spectral/cluster_solvers.cuh create mode 100644 cpp/include/cuvs/spectral/cluster_solvers_deprecated.cuh create mode 100644 cpp/include/cuvs/spectral/detail/lapack.hpp create mode 100644 cpp/include/cuvs/spectral/detail/matrix_wrappers.hpp create mode 100644 cpp/include/cuvs/spectral/detail/modularity_maximization.hpp create mode 100644 cpp/include/cuvs/spectral/detail/partition.hpp create mode 100644 cpp/include/cuvs/spectral/detail/spectral_util.cuh create mode 100644 cpp/include/cuvs/spectral/detail/warn_dbg.hpp 
create mode 100644 cpp/include/cuvs/spectral/eigen_solvers.cuh create mode 100644 cpp/include/cuvs/spectral/matrix_wrappers.hpp create mode 100644 cpp/include/cuvs/spectral/modularity_maximization.cuh create mode 100644 cpp/include/cuvs/spectral/partition.cuh create mode 100644 cpp/include/cuvs/spectral/specializations.cuh create mode 100644 cpp/include/cuvs/stats/accuracy.cuh create mode 100644 cpp/include/cuvs/stats/adjusted_rand_index.cuh create mode 100644 cpp/include/cuvs/stats/completeness_score.cuh create mode 100644 cpp/include/cuvs/stats/contingency_matrix.cuh create mode 100644 cpp/include/cuvs/stats/cov.cuh create mode 100644 cpp/include/cuvs/stats/detail/adjusted_rand_index.cuh create mode 100644 cpp/include/cuvs/stats/detail/batched/information_criterion.cuh create mode 100644 cpp/include/cuvs/stats/detail/batched/silhouette_score.cuh create mode 100644 cpp/include/cuvs/stats/detail/contingencyMatrix.cuh create mode 100644 cpp/include/cuvs/stats/detail/cov.cuh create mode 100644 cpp/include/cuvs/stats/detail/dispersion.cuh create mode 100644 cpp/include/cuvs/stats/detail/entropy.cuh create mode 100644 cpp/include/cuvs/stats/detail/histogram.cuh create mode 100644 cpp/include/cuvs/stats/detail/homogeneity_score.cuh create mode 100644 cpp/include/cuvs/stats/detail/kl_divergence.cuh create mode 100644 cpp/include/cuvs/stats/detail/mean.cuh create mode 100644 cpp/include/cuvs/stats/detail/mean_center.cuh create mode 100644 cpp/include/cuvs/stats/detail/meanvar.cuh create mode 100644 cpp/include/cuvs/stats/detail/minmax.cuh create mode 100644 cpp/include/cuvs/stats/detail/mutual_info_score.cuh create mode 100644 cpp/include/cuvs/stats/detail/neighborhood_recall.cuh create mode 100644 cpp/include/cuvs/stats/detail/rand_index.cuh create mode 100644 cpp/include/cuvs/stats/detail/scores.cuh create mode 100644 cpp/include/cuvs/stats/detail/silhouette_score.cuh create mode 100644 cpp/include/cuvs/stats/detail/stddev.cuh create mode 100644 
cpp/include/cuvs/stats/detail/sum.cuh create mode 100644 cpp/include/cuvs/stats/detail/trustworthiness_score.cuh create mode 100644 cpp/include/cuvs/stats/detail/v_measure.cuh create mode 100644 cpp/include/cuvs/stats/detail/weighted_mean.cuh create mode 100644 cpp/include/cuvs/stats/dispersion.cuh create mode 100644 cpp/include/cuvs/stats/entropy.cuh create mode 100644 cpp/include/cuvs/stats/histogram.cuh create mode 100644 cpp/include/cuvs/stats/homogeneity_score.cuh create mode 100644 cpp/include/cuvs/stats/information_criterion.cuh create mode 100644 cpp/include/cuvs/stats/kl_divergence.cuh create mode 100644 cpp/include/cuvs/stats/mean.cuh create mode 100644 cpp/include/cuvs/stats/mean_center.cuh create mode 100644 cpp/include/cuvs/stats/meanvar.cuh create mode 100644 cpp/include/cuvs/stats/minmax.cuh create mode 100644 cpp/include/cuvs/stats/mutual_info_score.cuh create mode 100644 cpp/include/cuvs/stats/neighborhood_recall.cuh create mode 100644 cpp/include/cuvs/stats/r2_score.cuh create mode 100644 cpp/include/cuvs/stats/rand_index.cuh create mode 100644 cpp/include/cuvs/stats/regression_metrics.cuh create mode 100644 cpp/include/cuvs/stats/silhouette_score.cuh create mode 100644 cpp/include/cuvs/stats/specializations.cuh create mode 100644 cpp/include/cuvs/stats/stats_types.hpp create mode 100644 cpp/include/cuvs/stats/stddev.cuh create mode 100644 cpp/include/cuvs/stats/sum.cuh create mode 100644 cpp/include/cuvs/stats/trustworthiness_score.cuh create mode 100644 cpp/include/cuvs/stats/v_measure.cuh create mode 100644 cpp/include/cuvs/stats/weighted_mean.cuh create mode 100644 cpp/include/cuvs_runtime/cluster/kmeans.hpp create mode 100644 cpp/include/cuvs_runtime/distance/fused_l2_nn.hpp create mode 100644 cpp/include/cuvs_runtime/distance/pairwise_distance.hpp create mode 100644 cpp/include/cuvs_runtime/matrix/select_k.hpp create mode 100644 cpp/include/cuvs_runtime/neighbors/brute_force.hpp create mode 100644 cpp/include/cuvs_runtime/neighbors/cagra.hpp 
create mode 100644 cpp/include/cuvs_runtime/neighbors/ivf_flat.hpp create mode 100644 cpp/include/cuvs_runtime/neighbors/ivf_pq.hpp create mode 100644 cpp/include/cuvs_runtime/neighbors/refine.hpp create mode 100644 cpp/internal/CMakeLists.txt create mode 100644 cpp/internal/cuvs_internal/matrix/select_k.cuh create mode 100644 cpp/internal/cuvs_internal/neighbors/naive_knn.cuh create mode 100644 cpp/internal/cuvs_internal/neighbors/refine_helper.cuh create mode 100644 cpp/scripts/__clang_cuda_additional_intrinsics.h create mode 100755 cpp/scripts/analyze_nvcc_log.py create mode 100644 cpp/scripts/gitutils.py create mode 100644 cpp/scripts/heuristics/select_k/algorithm_selection.ipynb create mode 100644 cpp/scripts/heuristics/select_k/generate_heuristic.ipynb create mode 100644 cpp/scripts/heuristics/select_k/generate_plots.ipynb create mode 100644 cpp/scripts/heuristics/select_k/select_k_dataset.py create mode 100644 cpp/scripts/include_checker.py create mode 100644 cpp/scripts/run-clang-compile.py create mode 100644 cpp/scripts/run-clang-tidy.py create mode 100755 cpp/scripts/run-cmake-format.sh create mode 100644 cpp/src/cuvs_runtime/cluster/cluster_cost.cuh create mode 100644 cpp/src/cuvs_runtime/cluster/cluster_cost_double.cu create mode 100644 cpp/src/cuvs_runtime/cluster/cluster_cost_float.cu create mode 100644 cpp/src/cuvs_runtime/cluster/kmeans_fit_double.cu create mode 100644 cpp/src/cuvs_runtime/cluster/kmeans_fit_float.cu create mode 100644 cpp/src/cuvs_runtime/cluster/kmeans_init_plus_plus_double.cu create mode 100644 cpp/src/cuvs_runtime/cluster/kmeans_init_plus_plus_float.cu create mode 100644 cpp/src/cuvs_runtime/cluster/update_centroids.cuh create mode 100644 cpp/src/cuvs_runtime/cluster/update_centroids_double.cu create mode 100644 cpp/src/cuvs_runtime/cluster/update_centroids_float.cu create mode 100644 cpp/src/cuvs_runtime/distance/fused_l2_min_arg.cu create mode 100644 cpp/src/cuvs_runtime/distance/pairwise_distance.cu create mode 100644 
cpp/src/cuvs_runtime/matrix/select_k_float_int64_t.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/brute_force_knn_int64_t_float.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/cagra_build.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/cagra_search.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/cagra_serialize.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/ivf_flat_build.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/ivf_flat_search.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/ivf_flat_serialize.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/ivfpq_build.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/ivfpq_deserialize.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/ivfpq_search_float_int64_t.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/ivfpq_search_int8_t_int64_t.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/ivfpq_search_uint8_t_int64_t.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/ivfpq_serialize.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/refine_d_int64_t_float.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/refine_d_int64_t_int8_t.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/refine_d_int64_t_uint8_t.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/refine_h_int64_t_float.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/refine_h_int64_t_int8_t.cu create mode 100644 cpp/src/cuvs_runtime/neighbors/refine_h_int64_t_uint8_t.cu create mode 100644 cpp/src/cuvs_runtime/random/common.cuh create mode 100644 cpp/src/cuvs_runtime/random/rmat_rectangular_generator_int64_double.cu create mode 100644 cpp/src/cuvs_runtime/random/rmat_rectangular_generator_int64_float.cu create mode 100644 cpp/src/cuvs_runtime/random/rmat_rectangular_generator_int_double.cu create mode 100644 cpp/src/cuvs_runtime/random/rmat_rectangular_generator_int_float.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py create mode 100644 
cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_float_float_float_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_double_double_double_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_float_float_float_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_double_double_double_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_float_float_float_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_l1_double_double_double_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_l1_float_float_float_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu create mode 100644 
cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu create mode 100644 cpp/src/distance/distance.cu create mode 100644 cpp/src/distance/fused_l2_nn.cu create mode 100644 cpp/src/matrix/detail/select_k_double_int64_t.cu create mode 100644 cpp/src/matrix/detail/select_k_double_uint32_t.cu create mode 100644 cpp/src/matrix/detail/select_k_float_int32.cu create mode 100644 cpp/src/matrix/detail/select_k_float_int64_t.cu create mode 100644 cpp/src/matrix/detail/select_k_float_uint32_t.cu create mode 100644 cpp/src/matrix/detail/select_k_half_int64_t.cu create mode 100644 cpp/src/matrix/detail/select_k_half_uint32_t.cu create mode 100644 cpp/src/neighbors/ball_cover.cu create mode 100644 cpp/src/neighbors/brute_force_00_generate.py create mode 100644 cpp/src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu create mode 100644 cpp/src/neighbors/brute_force_knn_index_float.cu create mode 100644 cpp/src/neighbors/brute_force_knn_int64_t_float_int64_t.cu create mode 100644 cpp/src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu create mode 100644 cpp/src/neighbors/brute_force_knn_int_float_int.cu create mode 100644 
cpp/src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu create mode 100644 
cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu create mode 100644 cpp/src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu create mode 100644 cpp/src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu create mode 100644 cpp/src/neighbors/detail/ivf_flat_search.cu create mode 100644 cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py create mode 100644 cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu create mode 100644 cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu create mode 100644 
cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu create mode 100644 cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu create mode 100644 cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu create mode 100644 cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu create mode 100644 cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu create mode 100644 cpp/src/neighbors/detail/refine_host_float_float.cpp create mode 100644 cpp/src/neighbors/detail/refine_host_int8_t_float.cpp create mode 100644 cpp/src/neighbors/detail/refine_host_uint8_t_float.cpp create mode 100644 cpp/src/neighbors/detail/selection_faiss_00_generate.py create mode 100644 cpp/src/neighbors/detail/selection_faiss_int32_t_float.cu create mode 100644 cpp/src/neighbors/detail/selection_faiss_int64_t_double.cu create mode 100644 cpp/src/neighbors/detail/selection_faiss_int64_t_half.cu create mode 100644 cpp/src/neighbors/detail/selection_faiss_int_double.cu create mode 100644 cpp/src/neighbors/detail/selection_faiss_long_float.cu create mode 100644 cpp/src/neighbors/detail/selection_faiss_size_t_double.cu create mode 100644 cpp/src/neighbors/detail/selection_faiss_size_t_float.cu create mode 100644 cpp/src/neighbors/detail/selection_faiss_uint32_t_double.cu create mode 100644 cpp/src/neighbors/detail/selection_faiss_uint32_t_float.cu create mode 100644 cpp/src/neighbors/detail/selection_faiss_uint32_t_half.cu create mode 100644 cpp/src/neighbors/ivf_flat_00_generate.py create mode 100644 cpp/src/neighbors/ivf_flat_build_float_int64_t.cu create mode 100644 cpp/src/neighbors/ivf_flat_build_int8_t_int64_t.cu create mode 100644 cpp/src/neighbors/ivf_flat_build_uint8_t_int64_t.cu create mode 100644 cpp/src/neighbors/ivf_flat_extend_float_int64_t.cu create mode 100644 cpp/src/neighbors/ivf_flat_extend_int8_t_int64_t.cu create mode 100644 cpp/src/neighbors/ivf_flat_extend_uint8_t_int64_t.cu create mode 100644 
cpp/src/neighbors/ivf_flat_search_float_int64_t.cu create mode 100644 cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu create mode 100644 cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu create mode 100644 cpp/src/neighbors/ivfpq_build_float_int64_t.cu create mode 100644 cpp/src/neighbors/ivfpq_build_int8_t_int64_t.cu create mode 100644 cpp/src/neighbors/ivfpq_build_uint8_t_int64_t.cu create mode 100644 cpp/src/neighbors/ivfpq_extend_float_int64_t.cu create mode 100644 cpp/src/neighbors/ivfpq_extend_int8_t_int64_t.cu create mode 100644 cpp/src/neighbors/ivfpq_extend_uint8_t_int64_t.cu create mode 100644 cpp/src/neighbors/ivfpq_search_float_int64_t.cu create mode 100644 cpp/src/neighbors/ivfpq_search_int8_t_int64_t.cu create mode 100644 cpp/src/neighbors/ivfpq_search_uint8_t_int64_t.cu create mode 100644 cpp/src/neighbors/refine_00_generate.py create mode 100644 cpp/src/neighbors/refine_float_float.cu create mode 100644 cpp/src/neighbors/refine_int8_t_float.cu create mode 100644 cpp/src/neighbors/refine_uint8_t_float.cu create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers.cu create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_00_generate.py create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_dist.cu create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_euclidean.cu create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_haversine.cu create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_dist.cu create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_euclidean.cu create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_haversine.cu create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_dist.cu create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_euclidean.cu create mode 100644 
cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_haversine.cu create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_dist.cu create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_euclidean.cu create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_haversine.cu create mode 100644 cpp/src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu create mode 100644 cpp/src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu create mode 100644 cpp/src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu create mode 100644 cpp/template/CMakeLists.txt create mode 100644 cpp/template/README.md create mode 100755 cpp/template/build.sh create mode 100644 cpp/template/cmake/thirdparty/fetch_rapids.cmake create mode 100644 cpp/template/cmake/thirdparty/get_cuvs.cmake create mode 100644 cpp/template/src/cagra_example.cu create mode 100644 cpp/template/src/common.cuh create mode 100644 cpp/template/src/ivf_flat_example.cu create mode 100644 cpp/test/CMakeLists.txt create mode 100644 cpp/test/cluster/cluster_solvers.cu create mode 100644 cpp/test/cluster/cluster_solvers_deprecated.cu create mode 100644 cpp/test/cluster/kmeans.cu create mode 100644 cpp/test/cluster/kmeans_balanced.cu create mode 100644 cpp/test/cluster/kmeans_find_k.cu create mode 100644 cpp/test/cluster/linkage.cu create mode 100644 cpp/test/distance/dist_adj.cu create mode 100644 cpp/test/distance/dist_adj.cuh create mode 100644 cpp/test/distance/dist_adj_distance_instance.cu create mode 100644 cpp/test/distance/dist_adj_threshold.cuh create mode 100644 cpp/test/distance/dist_canberra.cu create mode 100644 cpp/test/distance/dist_correlation.cu create mode 100644 cpp/test/distance/dist_cos.cu create mode 100644 cpp/test/distance/dist_hamming.cu create mode 100644 cpp/test/distance/dist_hellinger.cu create mode 100644 cpp/test/distance/dist_inner_product.cu create mode 100644 cpp/test/distance/dist_jensen_shannon.cu create mode 100644 
cpp/test/distance/dist_kl_divergence.cu create mode 100644 cpp/test/distance/dist_l1.cu create mode 100644 cpp/test/distance/dist_l2_exp.cu create mode 100644 cpp/test/distance/dist_l2_sqrt_exp.cu create mode 100644 cpp/test/distance/dist_l2_unexp.cu create mode 100644 cpp/test/distance/dist_l_inf.cu create mode 100644 cpp/test/distance/dist_lp_unexp.cu create mode 100644 cpp/test/distance/dist_russell_rao.cu create mode 100644 cpp/test/distance/distance_base.cuh create mode 100644 cpp/test/distance/fused_l2_nn.cu create mode 100644 cpp/test/distance/gram.cu create mode 100644 cpp/test/distance/gram_base.cuh create mode 100644 cpp/test/distance/masked_nn.cu create mode 100644 cpp/test/distance/masked_nn_compress_to_bits.cu create mode 100644 cpp/test/ext_headers/00_generate.py create mode 100644 cpp/test/ext_headers/raft_core_logger.cpp create mode 100644 cpp/test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu create mode 100644 cpp/test/ext_headers/raft_distance_distance.cu create mode 100644 cpp/test/ext_headers/raft_distance_fused_l2_nn.cu create mode 100644 cpp/test/ext_headers/raft_linalg_detail_coalesced_reduction.cu create mode 100644 cpp/test/ext_headers/raft_matrix_detail_select_k.cu create mode 100644 cpp/test/ext_headers/raft_neighbors_ball_cover.cu create mode 100644 cpp/test/ext_headers/raft_neighbors_brute_force.cu create mode 100644 cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu create mode 100644 cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu create mode 100644 cpp/test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu create mode 100644 cpp/test/ext_headers/raft_neighbors_detail_selection_faiss.cu create mode 100644 cpp/test/ext_headers/raft_neighbors_ivf_flat.cu create mode 100644 cpp/test/ext_headers/raft_neighbors_ivf_pq.cu create mode 100644 cpp/test/ext_headers/raft_neighbors_refine.cu create mode 100644 cpp/test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu 
create mode 100644 cpp/test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu create mode 100644 cpp/test/ext_headers/raft_util_memory_pool.cpp create mode 100644 cpp/test/neighbors/ann_cagra.cuh create mode 100644 cpp/test/neighbors/ann_cagra/search_kernel_uint64_t.cuh create mode 100644 cpp/test/neighbors/ann_cagra/test_float_int64_t.cu create mode 100644 cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu create mode 100644 cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu create mode 100644 cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu create mode 100644 cpp/test/neighbors/ann_ivf_flat.cuh create mode 100644 cpp/test/neighbors/ann_ivf_flat/test_filter_float_int64_t.cu create mode 100644 cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu create mode 100644 cpp/test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu create mode 100644 cpp/test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu create mode 100644 cpp/test/neighbors/ann_ivf_pq.cuh create mode 100644 cpp/test/neighbors/ann_ivf_pq/test_filter_float_int64_t.cu create mode 100644 cpp/test/neighbors/ann_ivf_pq/test_filter_int8_t_int64_t.cu create mode 100644 cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu create mode 100644 cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu create mode 100644 cpp/test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu create mode 100644 cpp/test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu create mode 100644 cpp/test/neighbors/ann_nn_descent.cuh create mode 100644 cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu create mode 100644 cpp/test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu create mode 100644 cpp/test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu create mode 100644 cpp/test/neighbors/ann_utils.cuh create mode 100644 cpp/test/neighbors/ball_cover.cu create mode 100644 cpp/test/neighbors/epsilon_neighborhood.cu create mode 100644 cpp/test/neighbors/fused_l2_knn.cu create mode 100644 cpp/test/neighbors/haversine.cu create mode 100644 
cpp/test/neighbors/knn.cu create mode 100644 cpp/test/neighbors/knn_utils.cuh create mode 100644 cpp/test/neighbors/refine.cu create mode 100644 cpp/test/neighbors/selection.cu create mode 100644 cpp/test/neighbors/spatial_data.h create mode 100644 cpp/test/neighbors/tiled_knn.cu create mode 100644 cpp/test/sparse/dist_coo_spmv.cu create mode 100644 cpp/test/sparse/distance.cu create mode 100644 cpp/test/sparse/gram.cu create mode 100644 cpp/test/sparse/neighbors/brute_force.cu create mode 100644 cpp/test/sparse/neighbors/cross_component_nn.cu create mode 100644 cpp/test/sparse/neighbors/knn_graph.cu create mode 100644 cpp/test/sparse/spectral_matrix.cu create mode 100644 cpp/test/stats/accuracy.cu create mode 100644 cpp/test/stats/adjusted_rand_index.cu create mode 100644 cpp/test/stats/completeness_score.cu create mode 100644 cpp/test/stats/contingencyMatrix.cu create mode 100644 cpp/test/stats/cov.cu create mode 100644 cpp/test/stats/dispersion.cu create mode 100644 cpp/test/stats/entropy.cu create mode 100644 cpp/test/stats/histogram.cu create mode 100644 cpp/test/stats/homogeneity_score.cu create mode 100644 cpp/test/stats/information_criterion.cu create mode 100644 cpp/test/stats/kl_divergence.cu create mode 100644 cpp/test/stats/mean.cu create mode 100644 cpp/test/stats/meanvar.cu create mode 100644 cpp/test/stats/minmax.cu create mode 100644 cpp/test/stats/mutual_info_score.cu create mode 100644 cpp/test/stats/neighborhood_recall.cu create mode 100644 cpp/test/stats/r2_score.cu create mode 100644 cpp/test/stats/rand_index.cu create mode 100644 cpp/test/stats/regression_metrics.cu create mode 100644 cpp/test/stats/silhouette_score.cu create mode 100644 cpp/test/stats/stddev.cu create mode 100644 cpp/test/stats/sum.cu create mode 100644 cpp/test/stats/trustworthiness.cu create mode 100644 cpp/test/stats/v_measure.cu create mode 100644 cpp/test/stats/weighted_mean.cu create mode 100644 cpp/test/test.cpp create mode 100644 cpp/test/test_utils.cuh create mode 
100644 cpp/test/test_utils.h create mode 100644 dependencies.yaml create mode 100644 docs/Makefile create mode 100644 docs/README.md create mode 100644 docs/make.bat create mode 100644 docs/source/_static/references.css create mode 100644 docs/source/ann_benchmarks_build.md create mode 100644 docs/source/ann_benchmarks_dataset.md create mode 100644 docs/source/ann_benchmarks_low_level.md create mode 100644 docs/source/ann_benchmarks_param_tuning.md create mode 100644 docs/source/build.md create mode 100644 docs/source/conf.py create mode 100755 docs/source/contributing.md create mode 100644 docs/source/cpp_api.rst create mode 100644 docs/source/cpp_api/cluster.rst create mode 100644 docs/source/cpp_api/cluster_kmeans.rst create mode 100644 docs/source/cpp_api/cluster_kmeans_balanced.rst create mode 100644 docs/source/cpp_api/cluster_slhc.rst create mode 100644 docs/source/cpp_api/cluster_spectral.rst create mode 100644 docs/source/cpp_api/distance.rst create mode 100644 docs/source/cpp_api/distance_1nn.rst create mode 100644 docs/source/cpp_api/distance_pairwise.rst create mode 100644 docs/source/cpp_api/neighbors.rst create mode 100644 docs/source/cpp_api/neighbors_ball_cover.rst create mode 100644 docs/source/cpp_api/neighbors_brute_force.rst create mode 100644 docs/source/cpp_api/neighbors_cagra.rst create mode 100644 docs/source/cpp_api/neighbors_epsilon_neighborhood.rst create mode 100644 docs/source/cpp_api/neighbors_ivf_flat.rst create mode 100644 docs/source/cpp_api/neighbors_ivf_pq.rst create mode 100644 docs/source/cpp_api/sparse.rst create mode 100644 docs/source/cpp_api/sparse_distance.rst create mode 100644 docs/source/cpp_api/sparse_neighbors.rst create mode 100644 docs/source/cpp_api/sparse_types.rst create mode 100644 docs/source/cpp_api/stats.rst create mode 100644 docs/source/cpp_api/stats_clustering.rst create mode 100644 docs/source/cpp_api/stats_neighborhood.rst create mode 100644 docs/source/cpp_api/stats_probability.rst create mode 100644 
docs/source/developer_guide.md create mode 100644 docs/source/index.rst create mode 100644 docs/source/pylibraft_api.rst create mode 100644 docs/source/python_api/cluster.rst create mode 100644 docs/source/python_api/common.rst create mode 100644 docs/source/python_api/distance.rst create mode 100644 docs/source/python_api/matrix.rst create mode 100644 docs/source/python_api/neighbors.rst create mode 100644 docs/source/quick_start.md create mode 100644 docs/source/raft_ann_benchmarks.md create mode 100644 docs/source/sphinxext/github_link.py create mode 100644 docs/source/vector_search_tutorial.md create mode 100644 docs/source/wiki_all_dataset.md create mode 100644 fetch_rapids.cmake create mode 100644 img/raft-vector-search-batch-10.png create mode 100644 img/rapids_arrow.png create mode 100644 img/rapids_logo.png create mode 100644 notebooks/VectorSearch_QuestionRetrieval.ipynb create mode 100644 notebooks/ivf_flat_example.ipynb create mode 100644 notebooks/tutorial_ivf_pq.ipynb create mode 100644 notebooks/utils.py create mode 100644 pyproject.toml create mode 120000 python/cuda-ann-bench/LICENSE create mode 100644 python/cuda-ann-bench/pyproject.toml create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/__init__.py create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/constraints/__init__.py create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/data_export/__main__.py create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/generate_groundtruth/__main__.py create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/generate_groundtruth/utils.py create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/get_dataset/__main__.py create mode 100755 python/cuda-ann-bench/src/cuda-ann-bench/get_dataset/fbin_to_f16bin.py create mode 100755 python/cuda-ann-bench/src/cuda-ann-bench/get_dataset/hdf5_to_fbin.py create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/plot/__main__.py create mode 100644 
python/cuda-ann-bench/src/cuda-ann-bench/run/__main__.py create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/algos.yaml create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/algos/faiss_gpu_ivf_flat.yaml create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/algos/faiss_gpu_ivf_pq.yaml create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/algos/hnswlib.yaml create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/algos/raft_cagra.yaml create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/algos/raft_cagra_hnswlib.yaml create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/algos/raft_ivf_flat.yaml create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/algos/raft_ivf_pq.yaml create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/bigann-100M.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/datasets.yaml create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/deep-100M.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/deep-1B.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/deep-image-96-inner.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/fashion-mnist-784-euclidean.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/gist-960-euclidean.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/glove-100-angular.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/glove-100-inner.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/glove-50-angular.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/glove-50-inner.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/lastfm-65-angular.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/mnist-784-euclidean.json create mode 100644 
python/cuda-ann-bench/src/cuda-ann-bench/run/conf/nytimes-256-angular.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/nytimes-256-inner.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/sift-128-euclidean.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/wiki_all_10M.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/wiki_all_1M.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/run/conf/wiki_all_88M.json create mode 100644 python/cuda-ann-bench/src/cuda-ann-bench/split_groundtruth/__main__.py create mode 100755 python/cuda-ann-bench/src/cuda-ann-bench/split_groundtruth/split_groundtruth.pl create mode 100644 python/cuvs/.coveragerc create mode 100644 python/cuvs/CMakeLists.txt create mode 120000 python/cuvs/LICENSE create mode 120000 python/cuvs/cuvs/VERSION create mode 100644 python/cuvs/cuvs/__init__.py create mode 100644 python/cuvs/cuvs/_version.py create mode 100644 python/cuvs/cuvs/cluster/CMakeLists.txt create mode 100644 python/cuvs/cuvs/cluster/__init__.pxd create mode 100644 python/cuvs/cuvs/cluster/__init__.py create mode 100644 python/cuvs/cuvs/cluster/cpp/__init__.pxd create mode 100644 python/cuvs/cuvs/cluster/cpp/__init__.py create mode 100644 python/cuvs/cuvs/cluster/cpp/kmeans.pxd create mode 100644 python/cuvs/cuvs/cluster/cpp/kmeans_types.pxd create mode 100644 python/cuvs/cuvs/cluster/kmeans.pyx create mode 100644 python/cuvs/cuvs/distance/CMakeLists.txt create mode 100644 python/cuvs/cuvs/distance/__init__.pxd create mode 100644 python/cuvs/cuvs/distance/__init__.py create mode 100644 python/cuvs/cuvs/distance/distance_type.pxd create mode 100644 python/cuvs/cuvs/distance/fused_l2_nn.pyx create mode 100644 python/cuvs/cuvs/distance/pairwise_distance.pyx create mode 100644 python/cuvs/cuvs/matrix/CMakeLists.txt create mode 100644 python/cuvs/cuvs/matrix/__init__.pxd create mode 100644 python/cuvs/cuvs/matrix/__init__.py create mode 
100644 python/cuvs/cuvs/matrix/cpp/__init__.pxd create mode 100644 python/cuvs/cuvs/matrix/cpp/__init__.py create mode 100644 python/cuvs/cuvs/matrix/cpp/select_k.pxd create mode 100644 python/cuvs/cuvs/matrix/select_k.pyx create mode 100644 python/cuvs/cuvs/neighbors/CMakeLists.txt create mode 100644 python/cuvs/cuvs/neighbors/__init__.pxd create mode 100644 python/cuvs/cuvs/neighbors/__init__.py create mode 100644 python/cuvs/cuvs/neighbors/brute_force.pyx create mode 100644 python/cuvs/cuvs/neighbors/cagra/CMakeLists.txt create mode 100644 python/cuvs/cuvs/neighbors/cagra/__init__.pxd create mode 100644 python/cuvs/cuvs/neighbors/cagra/__init__.py create mode 100644 python/cuvs/cuvs/neighbors/cagra/cagra.pyx create mode 100644 python/cuvs/cuvs/neighbors/cagra/cpp/__init__.pxd create mode 100644 python/cuvs/cuvs/neighbors/cagra/cpp/__init__.py create mode 100644 python/cuvs/cuvs/neighbors/cagra/cpp/c_cagra.pxd create mode 100644 python/cuvs/cuvs/neighbors/common.pxd create mode 100644 python/cuvs/cuvs/neighbors/common.pyx create mode 100644 python/cuvs/cuvs/neighbors/cpp/__init__.pxd create mode 100644 python/cuvs/cuvs/neighbors/cpp/__init__.py create mode 100644 python/cuvs/cuvs/neighbors/cpp/brute_force.pxd create mode 100644 python/cuvs/cuvs/neighbors/ivf_flat/CMakeLists.txt create mode 100644 python/cuvs/cuvs/neighbors/ivf_flat/__init__.pxd create mode 100644 python/cuvs/cuvs/neighbors/ivf_flat/__init__.py create mode 100644 python/cuvs/cuvs/neighbors/ivf_flat/cpp/__init__.pxd create mode 100644 python/cuvs/cuvs/neighbors/ivf_flat/cpp/__init__.py create mode 100644 python/cuvs/cuvs/neighbors/ivf_flat/cpp/c_ivf_flat.pxd create mode 100644 python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx create mode 100644 python/cuvs/cuvs/neighbors/ivf_pq/CMakeLists.txt create mode 100644 python/cuvs/cuvs/neighbors/ivf_pq/__init__.pxd create mode 100644 python/cuvs/cuvs/neighbors/ivf_pq/__init__.py create mode 100644 python/cuvs/cuvs/neighbors/ivf_pq/cpp/__init__.pxd create 
mode 100644 python/cuvs/cuvs/neighbors/ivf_pq/cpp/__init__.py create mode 100644 python/cuvs/cuvs/neighbors/ivf_pq/cpp/c_ivf_pq.pxd create mode 100644 python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd create mode 100644 python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx create mode 100644 python/cuvs/cuvs/neighbors/refine.pyx create mode 100644 python/cuvs/cuvs/test/test_brute_force.py create mode 100644 python/cuvs/cuvs/test/test_cagra.py create mode 100644 python/cuvs/cuvs/test/test_device_ndarray.py create mode 100644 python/cuvs/cuvs/test/test_distance.py create mode 100644 python/cuvs/cuvs/test/test_doctests.py create mode 100644 python/cuvs/cuvs/test/test_fused_l2_argmin.py create mode 100644 python/cuvs/cuvs/test/test_ivf_flat.py create mode 100644 python/cuvs/cuvs/test/test_ivf_pq.py create mode 100644 python/cuvs/cuvs/test/test_kmeans.py create mode 100644 python/cuvs/cuvs/test/test_refine.py create mode 100644 python/cuvs/cuvs/test/test_select_k.py create mode 100644 python/cuvs/pyproject.toml create mode 100644 python/cuvs/setup.cfg create mode 100644 python/cuvs/setup.py create mode 100644 setup.cfg create mode 100644 thirdparty/LICENSES/LICENSE.ann-benchmark create mode 100644 thirdparty/LICENSES/LICENSE.faiss create mode 100644 thirdparty/LICENSES/LICENSE.pytorch create mode 100644 thirdparty/LICENSES/LICENSE_Date_Nagi diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 000000000..9d35e3f97 --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,30 @@ +# syntax=docker/dockerfile:1.5 + +ARG BASE +ARG PYTHON_PACKAGE_MANAGER=conda + +FROM ${BASE} as pip-base + +ENV DEFAULT_VIRTUAL_ENV=rapids + +FROM ${BASE} as conda-base + +ENV DEFAULT_CONDA_ENV=rapids + +FROM ${PYTHON_PACKAGE_MANAGER}-base + +ARG CUDA +ENV CUDAARCHS="RAPIDS" +ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}" + +ARG PYTHON_PACKAGE_MANAGER +ENV PYTHON_PACKAGE_MANAGER="${PYTHON_PACKAGE_MANAGER}" + +ENV PYTHONSAFEPATH="1" +ENV PYTHONUNBUFFERED="1" +ENV 
PYTHONDONTWRITEBYTECODE="1" + +ENV SCCACHE_REGION="us-east-2" +ENV SCCACHE_BUCKET="rapids-sccache-devs" +ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai" +ENV HISTFILE="/home/coder/.cache/._bash_history" diff --git a/.devcontainer/README.md b/.devcontainer/README.md new file mode 100644 index 000000000..18390267a --- /dev/null +++ b/.devcontainer/README.md @@ -0,0 +1,64 @@ +# RAFT Development Containers + +This directory contains [devcontainer configurations](https://containers.dev/implementors/json_reference/) for using VSCode to [develop in a container](https://code.visualstudio.com/docs/devcontainers/containers) via the `Remote Containers` [extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) or [GitHub Codespaces](https://github.com/codespaces). + +This container is a turnkey development environment for building and testing the RAFT C++ and Python libraries. + +## Table of Contents + +* [Prerequisites](#prerequisites) +* [Host bind mounts](#host-bind-mounts) +* [Launch a Dev Container](#launch-a-dev-container) + +## Prerequisites + +* [VSCode](https://code.visualstudio.com/download) +* [VSCode Remote Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) + +## Host bind mounts + +By default, the following directories are bind-mounted into the devcontainer: + +* `${repo}:/home/coder/cuvs` +* `${repo}/../.aws:/home/coder/.aws` +* `${repo}/../.local:/home/coder/.local` +* `${repo}/../.cache:/home/coder/.cache` +* `${repo}/../.conda:/home/coder/.conda` +* `${repo}/../.config:/home/coder/.config` + +This ensures caches, configurations, dependencies, and your commits are persisted on the host across container runs. + +## Launch a Dev Container + +To launch a devcontainer from VSCode, open the RAFT repo and select the "Reopen in Container" button in the bottom right:
+ +Alternatively, open the VSCode command palette (typically `cmd/ctrl + shift + P`) and run the "Rebuild and Reopen in Container" command. + +## Using the devcontainer + +On startup, the devcontainer creates or updates the conda/pip environment using `cuvs/dependencies.yaml`. + +The container includes convenience functions to clean, configure, and build the various RAFT components: + +```shell +$ clean-cuvs-cpp # only cleans the C++ build dir +$ clean-pylibcuvs-python # only cleans the Python build dir +$ clean-cuvs # cleans both C++ and Python build dirs + +$ configure-cuvs-cpp # only configures cuvs C++ lib + +$ build-cuvs-cpp # only builds cuvs C++ lib +$ build-pylibcuvs-python # only builds cuvs Python lib +$ build-cuvs # builds both C++ and Python libs +``` + +* The C++ build script is a small wrapper around `cmake -S ~/cuvs/cpp -B ~/cuvs/cpp/build` and `cmake --build ~/cuvs/cpp/build` +* The Python build script is a small wrapper around `pip install --editable ~/cuvs/cpp` + +Unlike `build.sh`, these convenience scripts *don't* install the libraries after building them. 
Instead, they automatically inject the correct arguments to build the C++ libraries from source and use their build dirs as package roots: + +```shell +$ cmake -S ~/cuvs/cpp -B ~/cuvs/cpp/build +$ CMAKE_ARGS="-Dcuvs_ROOT=~/cuvs/cpp/build" \ # <-- this argument is automatic + pip install -e ~/cuvs/cpp +``` diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json new file mode 100644 index 000000000..76ce8599a --- /dev/null +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -0,0 +1,37 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "11.8", + "PYTHON_PACKAGE_MANAGER": "conda", + "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda11.8-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuvs,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda11.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json new file mode 100644 index 000000000..3a126b36e --- /dev/null +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -0,0 +1,38 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "11.8", + "PYTHON_PACKAGE_MANAGER": "pip", + "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda11.8-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/ucx:24.2": {"version": "1.14.1"}, + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/ucx", + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs}"], + "postAttachCommand": 
["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuvs,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.0-conda/devcontainer.json new file mode 100644 index 000000000..426aaef98 --- /dev/null +++ b/.devcontainer/cuda12.0-conda/devcontainer.json @@ -0,0 +1,37 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "12.0", + "PYTHON_PACKAGE_MANAGER": "conda", + "BASE": "rapidsai/devcontainers:24.02-cpp-mambaforge-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.0-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuvs,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.0-pip/devcontainer.json new file mode 100644 index 000000000..1ef2fdcb6 --- /dev/null +++ b/.devcontainer/cuda12.0-pip/devcontainer.json @@ -0,0 +1,38 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "12.0", + "PYTHON_PACKAGE_MANAGER": "pip", + "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda12.0-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/ucx:24.2": {"version": "1.14.1"}, + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/ucx", + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs}"], + "postAttachCommand": 
["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuvs,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..d2253c76f --- /dev/null +++ b/.flake8 @@ -0,0 +1,24 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. + +[flake8] +filename = *.py, *.pyx, *.pxd, *.pxi +exclude = __init__.py, *.egg, build, docs, .git +force-check = True +ignore = + # line break before binary operator + W503, + # whitespace before : + E203 +per-file-ignores = + # Rules ignored only in Cython: + # E211: whitespace before '(' (used in multi-line imports) + # E225: Missing whitespace around operators (breaks cython casting syntax like ) + # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*) + # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax) + # E275: Missing whitespace after keyword (Doesn't work with Cython except?) 
+ # E402: invalid syntax (works for Python, not Cython) + # E999: invalid syntax (works for Python, not Cython) + # W504: line break after binary operator (breaks lines that end with a pointer) + *.pyx: E211, E225, E226, E227, E275, E402, E999, W504 + *.pxd: E211, E225, E226, E227, E275, E402, E999, W504 + *.pxi: E211, E225, E226, E227, E275, E402, E999, W504 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100755 index 000000000..407c5448e --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,21 @@ +#cpp code owners +cpp/ @rapidsai/cuvs-cpp-codeowners + +#python code owners +python/ @rapidsai/cuvs-python-codeowners + +#cmake code owners +**/CMakeLists.txt @rapidsai/cuvs-cmake-codeowners +**/cmake/ @rapidsai/cuvs-cmake-codeowners +python/setup.py @rapidsai/cuvs-cmake-codeowners +build.sh @rapidsai/cuvs-cmake-codeowners +**/build.sh @rapidsai/cuvs-cmake-codeowners + +#build/ops code owners +.github/ @rapidsai/ops-codeowners +ci/ @rapidsai/ops-codeowners +conda/ @rapidsai/ops-codeowners +**/Dockerfile @rapidsai/ops-codeowners +**/.dockerignore @rapidsai/ops-codeowners +docker/ @rapidsai/ops-codeowners +dependencies.yaml @rapidsai/ops-codeowners diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100755 index 000000000..bb9f1a280 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,26 @@ +--- +name: Bug report +about: Create a bug report to help us improve RAFT +title: "[BUG]" +labels: "? - Needs Triage, bug" +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**Steps/Code to reproduce bug** +Follow this guide http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports to craft a minimal bug report. This helps us reproduce the issue you're having and resolve the issue more quickly. + +**Expected behavior** +A clear and concise description of what you expected to happen. 
+ +**Environment details (please complete the following information):** + - Environment location: [Bare-metal, Docker, Cloud(specify cloud provider)] + - Method of RAFT install: [conda, Docker, or from source] + - If method of install is [Docker], provide `docker pull` & `docker run` commands used + + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/documentation-request.md b/.github/ISSUE_TEMPLATE/documentation-request.md new file mode 100755 index 000000000..89a026f34 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation-request.md @@ -0,0 +1,35 @@ +--- +name: Documentation request +about: Report incorrect or needed documentation +title: "[DOC]" +labels: "? - Needs Triage, doc" +assignees: '' + +--- + +## Report incorrect documentation + +**Location of incorrect documentation** +Provide links and line numbers if applicable. + +**Describe the problems or issues found in the documentation** +A clear and concise description of what you found to be incorrect. + +**Steps taken to verify documentation is incorrect** +List any steps you have taken: + +**Suggested fix for documentation** +Detail proposed changes to fix the documentation if you have any. + +--- + +## Report needed documentation + +**Report needed documentation** +A clear and concise description of what documentation you believe it is needed and why. + +**Describe the documentation you'd like** +A clear and concise description of what you want to happen. + +**Steps taken to search for needed documentation** +List any steps you have taken: diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100755 index 000000000..9988a2a05 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for RAFT +title: "[FEA]" +labels: "? 
- Needs Triage, feature request" +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I wish I could use RAFT to do [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context, code examples, or references to existing implementations about the feature request here. diff --git a/.github/ISSUE_TEMPLATE/submit-question.md b/.github/ISSUE_TEMPLATE/submit-question.md new file mode 100755 index 000000000..11a7f8ee2 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/submit-question.md @@ -0,0 +1,10 @@ +--- +name: Submit question +about: Ask a general question about RAFT +title: "[QST]" +labels: "? - Needs Triage, question" +assignees: '' + +--- + +**What is your question?** diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100755 index 000000000..9c42cda72 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,44 @@ + diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml new file mode 100644 index 000000000..895ba83ee --- /dev/null +++ b/.github/copy-pr-bot.yaml @@ -0,0 +1,4 @@ +# Configuration file for `copy-pr-bot` GitHub App +# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ + +enabled: true diff --git a/.github/labeler.yml b/.github/labeler.yml new file mode 100644 index 000000000..56f77e69c --- /dev/null +++ b/.github/labeler.yml @@ -0,0 +1,16 @@ +# https://github.com/actions/labeler#common-examples +# Adapted from https://github.com/rapidsai/raft/blob/main/.github/CODEOWNERS +# Labels culled from https://github.com/rapidsai/raft/labels + +python: + - 'python/**' + +cpp: + - 'cpp/**' + +CMake: + - '**/CMakeLists.txt' + - '**/cmake/**' + +ci: + - 'ci/**' diff --git 
a/.github/ops-bot.yaml b/.github/ops-bot.yaml new file mode 100644 index 000000000..9a0b41550 --- /dev/null +++ b/.github/ops-bot.yaml @@ -0,0 +1,8 @@ +# This file controls which features from the `ops-bot` repository below are enabled. +# - https://github.com/rapidsai/ops-bot + +auto_merger: true +branch_checker: true +label_checker: true +release_drafter: true +recently_updated: true diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 000000000..f079b5e78 --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,88 @@ +name: build + +on: + push: + branches: + - "branch-*" + tags: + - v[0-9][0-9].[0-9][0-9].[0-9][0-9] + workflow_dispatch: + inputs: + branch: + required: true + type: string + date: + required: true + type: string + sha: + required: true + type: string + build_type: + type: string + default: nightly + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} + cancel-in-progress: true + +jobs: + cpp-build: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + python-build: + needs: [cpp-build] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + upload-conda: + needs: [cpp-build, python-build] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.02 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + skip_upload_pkgs: libcuvs-template + docs-build: + if: github.ref_type == 'branch' + needs: python-build + secrets: inherit + uses: 
rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + with: + arch: "amd64" + branch: ${{ inputs.branch }} + build_type: ${{ inputs.build_type || 'branch' }} + container_image: "rapidsai/ci-conda:latest" + date: ${{ inputs.date }} + node_type: "gpu-v100-latest-1" + run_script: "ci/build_docs.sh" + sha: ${{ inputs.sha }} + wheel-build-cuvs: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + script: ci/build_wheel_cuvs.sh + wheel-publish-cuvs: + needs: wheel-build-cuvs + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: cuvs diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml new file mode 100644 index 000000000..55117f774 --- /dev/null +++ b/.github/workflows/labeler.yml @@ -0,0 +1,11 @@ +name: "Pull Request Labeler" +on: +- pull_request_target + +jobs: + triage: + runs-on: ubuntu-latest + steps: + - uses: actions/labeler@main + with: + repo-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml new file mode 100644 index 000000000..b190a2a62 --- /dev/null +++ b/.github/workflows/pr.yaml @@ -0,0 +1,96 @@ +name: pr + +on: + push: + branches: + - "pull-request/[0-9]+" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + pr-builder: + needs: + - checks + - conda-cpp-build + - conda-cpp-tests + - conda-cpp-checks + - conda-python-build + - conda-python-tests + - docs-build + - wheel-build-cuvs + - wheel-tests-cuvs + - devcontainer + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02 + 
checks: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.02 + with: + enable_check_generated_files: false + conda-cpp-build: + needs: checks + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02 + with: + build_type: pull-request + node_type: cpu16 + conda-cpp-tests: + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 + with: + build_type: pull-request + conda-cpp-checks: + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02 + with: + build_type: pull-request + enable_check_symbols: true + symbol_exclusions: (void (thrust::|cub::)|_ZN\d+raft_cutlass) + conda-python-build: + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 + with: + build_type: pull-request + conda-python-tests: + needs: conda-python-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 + with: + build_type: pull-request + docs-build: + needs: conda-python-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + with: + build_type: pull-request + node_type: "gpu-v100-latest-1" + arch: "amd64" + container_image: "rapidsai/ci-conda:latest" + run_script: "ci/build_docs.sh" + wheel-build-cuvs: + needs: checks + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + with: + build_type: pull-request + script: ci/build_wheel_cuvs.sh + wheel-tests-cuvs: + needs: wheel-build-cuvs + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + with: + build_type: pull-request + script: ci/test_wheel_cuvs.sh + devcontainer: + secrets: inherit + uses: 
rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.02 + with: + build_command: | + sccache -z; + build-all -DBUILD_PRIMS_BENCH=ON -DBUILD_ANN_BENCH=ON --verbose; + sccache -s; diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml new file mode 100644 index 000000000..acea5755f --- /dev/null +++ b/.github/workflows/test.yaml @@ -0,0 +1,51 @@ +name: test + +on: + workflow_dispatch: + inputs: + branch: + required: true + type: string + date: + required: true + type: string + sha: + required: true + type: string + +jobs: + conda-cpp-checks: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + enable_check_symbols: true + symbol_exclusions: (void (thrust::|cub::)|_ZN\d+raft_cutlass) + conda-cpp-tests: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + conda-python-tests: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + wheel-tests-cuvs: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + script: ci/test_wheel_cuvs.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..8cac2b5ae --- /dev/null +++ b/.gitignore @@ -0,0 +1,67 @@ +## common +__pycache__ +*.pyc +*~ +\#* +.#* +*.o +*.so +*.dylib +.cache +.coverage +.vscode +*.swp +*.pytest_cache +htmlcov +build/ +build_prims/ +dist/ +python/**/**/*.cpp +python/cuvs/record.txt +log 
+.ipynb_checkpoints +.DS_Store +dask-worker-space/ +*.egg-info/ +*.bin +bench/ann/data +temporary_*.json + +## scikit-build +_skbuild + +## eclipse +.project +.cproject +.settings +.ptp-sync-folder + +## Pycharm +.idea + +## ccls +.ccls-cache +.ccls + +## profiling +*.qdrep +*.qdrep.cache +*.qdstrm +*.nvprof + +## doxygen build check inside ci/checks/style.sh +doxygen_check/ + +## cibuildwheel +/wheelhouse + +# doxygen +_xml + +# sphinx +_html +_text + +# clang tooling +compile_commands.json +.clangd/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..9e3b1a38b --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,112 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. + +repos: + - repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + # Use the config file specific to each subproject so that each + # project can specify its own first/third-party packages. + args: ["--config-root=python/", "--resolve-all-configs"] + files: python/.* + types_or: [python, cython, pyi] + - repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black + files: python/.* + # Explicitly specify the pyproject.toml at the repo root, not per-project. 
+ args: ["--config", "pyproject.toml"] + - repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 + hooks: + - id: flake8 + args: ["--config=.flake8"] + files: python/.*$ + types: [file] + types_or: [python, cython] + additional_dependencies: ["flake8-force"] + - repo: https://github.com/pre-commit/mirrors-mypy + rev: 'v0.971' + hooks: + - id: mypy + additional_dependencies: [types-cachetools] + args: ["--config-file=pyproject.toml", + "python/cuvs/cuvs"] + pass_filenames: false + - repo: https://github.com/PyCQA/pydocstyle + rev: 6.1.1 + hooks: + - id: pydocstyle + # https://github.com/PyCQA/pydocstyle/issues/603 + additional_dependencies: [toml] + args: ["--config=pyproject.toml"] + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v16.0.6 + hooks: + - id: clang-format + types_or: [c, c++, cuda] + args: ["-fallback-style=none", "-style=file", "-i"] + - repo: local + hooks: + - id: no-deprecationwarning + name: no-deprecationwarning + description: 'Enforce that DeprecationWarning is not introduced (use FutureWarning instead)' + entry: '(category=|\s)DeprecationWarning[,)]' + language: pygrep + types_or: [python, cython] + - id: cmake-format + name: cmake-format + entry: ./cpp/scripts/run-cmake-format.sh cmake-format + language: python + types: [cmake] + exclude: .*/thirdparty/.*|.*FindAVX.cmake.* + # Note that pre-commit autoupdate does not update the versions + # of dependencies, so we'll have to update this manually. + additional_dependencies: + - cmakelang==0.6.13 + verbose: true + require_serial: true + - id: cmake-lint + name: cmake-lint + entry: ./cpp/scripts/run-cmake-format.sh cmake-lint + language: python + types: [cmake] + # Note that pre-commit autoupdate does not update the versions + # of dependencies, so we'll have to update this manually. 
+ additional_dependencies: + - cmakelang==0.6.13 + verbose: true + require_serial: true + exclude: .*/thirdparty/.* + - id: copyright-check + name: copyright-check + entry: python ./ci/checks/copyright.py --git-modified-only --update-current-year + language: python + pass_filenames: false + additional_dependencies: [gitpython] + - id: include-check + name: include-check + entry: python ./cpp/scripts/include_checker.py cpp/bench cpp/include cpp/test + pass_filenames: false + language: python + additional_dependencies: [gitpython] + - repo: https://github.com/codespell-project/codespell + rev: v2.2.2 + hooks: + - id: codespell + additional_dependencies: [tomli] + args: ["--toml", "pyproject.toml"] + exclude: (?x)^(^CHANGELOG.md$) + - repo: https://github.com/rapidsai/dependency-file-generator + rev: v1.5.1 + hooks: + - id: rapids-dependency-file-generator + args: ["--clean"] + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-json + +default_language_version: + python: python3 diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000..e69de29bb diff --git a/LICENSE b/LICENSE new file mode 100755 index 000000000..1a89b9054 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2020 NVIDIA Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/VERSION b/VERSION new file mode 100644 index 000000000..3c6c5e2b7 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +24.02.00 diff --git a/build.sh b/build.sh new file mode 100755 index 000000000..ea69f8d0d --- /dev/null +++ b/build.sh @@ -0,0 +1,526 @@ +#!/bin/bash + +# Copyright (c) 2020-2023, NVIDIA CORPORATION. + +# cuvs build scripts + +# This script is used to build the component(s) in this repo from +# source, and can be called with various options to customize the +# build as needed (see the help output for details) + +# Abort script on first error +set -e + +NUMARGS=$# +ARGS=$* + +# NOTE: ensure all dir changes are relative to the location of this +# scripts, and that this script resides in the repo dir! +REPODIR=$(cd $(dirname $0); pwd) + +VALIDARGS="clean libcuvs cuvs docs tests template bench-prims bench-ann clean --uninstall -v -g -n --compile-lib --compile-static-lib --allgpuarch --no-nvtx --cpu-only --show_depr_warn --incl-cache-stats --time -h" +HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=] [--limit-tests=] [--limit-bench-prims=] [--limit-bench-ann=] [--build-metrics=] + where is: + clean - remove all existing build artifacts and configuration (start over) + libcuvs - build the cuvs C++ code only. Also builds the C-wrapper library + around the C++ code. 
+ cuvs - build the cuvs Python package + docs - build the documentation + tests - build the tests + bench-prims - build micro-benchmarks for primitives + bench-ann - build end-to-end ann benchmarks + template - build the example CUVS application template + + and is: + -v - verbose build mode + -g - build for debug + -n - no install step + --uninstall - uninstall files for specified targets which were built and installed prior + --compile-lib - compile shared library for all components + --compile-static-lib - compile static library for all components + --cpu-only - build CPU only components without CUDA. Applies to bench-ann only currently. + --limit-tests - semicolon-separated list of test executables to compile (e.g. NEIGHBORS_TEST;CLUSTER_TEST) + --limit-bench-prims - semicolon-separated list of prims benchmark executables to compile (e.g. NEIGHBORS_PRIMS_BENCH;CLUSTER_PRIMS_BENCH) + --limit-bench-ann - semicolon-separated list of ann benchmark executables to compile (e.g. HNSWLIB_ANN_BENCH;CUVS_IVF_PQ_ANN_BENCH) + --allgpuarch - build for all supported GPU architectures + --no-nvtx - disable nvtx (profiling markers), but allow enabling it in downstream projects + --show_depr_warn - show cmake deprecation warnings + --build-metrics - filename for generating build metrics report for libcuvs + --incl-cache-stats - include cache statistics in build metrics report + --cmake-args=\\\"\\\" - pass arbitrary list of CMake configuration options (escape all quotes in argument) + --cache-tool= - pass the build cache tool (eg: ccache, sccache, distcc) that will be used + to speedup the build process. + --time - Enable nvcc compilation time logging into cpp/build/nvcc_compile_log.csv. 
+ Results can be interpreted with cpp/scripts/analyze_nvcc_log.py + -h - print this text + + default action (no args) is to build libcuvs, tests, cuvs and cuvs-dask targets +" +LIBCUVS_BUILD_DIR=${LIBCUVS_BUILD_DIR:=${REPODIR}/cpp/build} +SPHINX_BUILD_DIR=${REPODIR}/docs +DOXYGEN_BUILD_DIR=${REPODIR}/cpp/doxygen +CUVS_DASK_BUILD_DIR=${REPODIR}/python/cuvs-dask/_skbuild +PYLIBCUVS_BUILD_DIR=${REPODIR}/python/cuvs/_skbuild +BUILD_DIRS="${LIBCUVS_BUILD_DIR} ${PYLIBCUVS_BUILD_DIR} ${CUVS_DASK_BUILD_DIR}" + +# Set defaults for vars modified by flags to this script +CMAKE_LOG_LEVEL="" +VERBOSE_FLAG="" +BUILD_ALL_GPU_ARCH=0 +BUILD_TESTS=OFF +BUILD_TYPE=Release +BUILD_PRIMS_BENCH=OFF +BUILD_ANN_BENCH=OFF +BUILD_CPU_ONLY=OFF +COMPILE_LIBRARY=OFF +INSTALL_TARGET=install +BUILD_REPORT_METRICS="" +BUILD_REPORT_INCL_CACHE_STATS=OFF + +TEST_TARGETS="CLUSTER_TEST;CORE_TEST;DISTANCE_TEST;LABEL_TEST;LINALG_TEST;MATRIX_TEST;NEIGHBORS_TEST;NEIGHBORS_ANN_CAGRA_TEST;NEIGHBORS_ANN_NN_DESCENT_TEST;NEIGHBORS_ANN_IVF_TEST;RANDOM_TEST;SOLVERS_TEST;SPARSE_TEST;SPARSE_DIST_TEST;SPARSE_NEIGHBORS_TEST;STATS_TEST;UTILS_TEST" +BENCH_TARGETS="CLUSTER_BENCH;CORE_BENCH;NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH;MATRIX_BENCH;SPARSE_BENCH;RANDOM_BENCH" + +CACHE_ARGS="" +NVTX=ON +LOG_COMPILE_TIME=OFF +CLEAN=0 +UNINSTALL=0 +DISABLE_DEPRECATION_WARNINGS=ON +CMAKE_TARGET="" + +# Set defaults for vars that may not have been defined externally +INSTALL_PREFIX=${INSTALL_PREFIX:=${PREFIX:=${CONDA_PREFIX:=$LIBCUVS_BUILD_DIR/install}}} +PARALLEL_LEVEL=${PARALLEL_LEVEL:=`nproc`} +BUILD_ABI=${BUILD_ABI:=ON} + +# Default to Ninja if generator is not specified +export CMAKE_GENERATOR="${CMAKE_GENERATOR:=Ninja}" + +function hasArg { + (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") +} + +function cmakeArgs { + # Check for multiple cmake args options + if [[ $(echo $ARGS | { grep -Eo "\-\-cmake\-args" || true; } | wc -l ) -gt 1 ]]; then + echo "Multiple --cmake-args options were provided, please provide 
only one: ${ARGS}" + exit 1 + fi + + # Check for cmake args option + if [[ -n $(echo $ARGS | { grep -E "\-\-cmake\-args" || true; } ) ]]; then + # There are possible weird edge cases that may cause this regex filter to output nothing and fail silently + # the true pipe will catch any weird edge cases that may happen and will cause the program to fall back + # on the invalid option error + EXTRA_CMAKE_ARGS=$(echo $ARGS | { grep -Eo "\-\-cmake\-args=\".+\"" || true; }) + if [[ -n ${EXTRA_CMAKE_ARGS} ]]; then + # Remove the full EXTRA_CMAKE_ARGS argument from list of args so that it passes validArgs function + ARGS=${ARGS//$EXTRA_CMAKE_ARGS/} + # Filter the full argument down to just the extra string that will be added to cmake call + EXTRA_CMAKE_ARGS=$(echo $EXTRA_CMAKE_ARGS | grep -Eo "\".+\"" | sed -e 's/^"//' -e 's/"$//') + fi + fi +} + +function cacheTool { + # Check for multiple cache options + if [[ $(echo $ARGS | { grep -Eo "\-\-cache\-tool" || true; } | wc -l ) -gt 1 ]]; then + echo "Multiple --cache-tool options were provided, please provide only one: ${ARGS}" + exit 1 + fi + # Check for cache tool option + if [[ -n $(echo $ARGS | { grep -E "\-\-cache\-tool" || true; } ) ]]; then + # There are possible weird edge cases that may cause this regex filter to output nothing and fail silently + # the true pipe will catch any weird edge cases that may happen and will cause the program to fall back + # on the invalid option error + CACHE_TOOL=$(echo $ARGS | sed -e 's/.*--cache-tool=//' -e 's/ .*//') + if [[ -n ${CACHE_TOOL} ]]; then + # Remove the full CACHE_TOOL argument from list of args so that it passes validArgs function + ARGS=${ARGS//--cache-tool=$CACHE_TOOL/} + CACHE_ARGS="-DCMAKE_CUDA_COMPILER_LAUNCHER=${CACHE_TOOL} -DCMAKE_C_COMPILER_LAUNCHER=${CACHE_TOOL} -DCMAKE_CXX_COMPILER_LAUNCHER=${CACHE_TOOL}" + fi + fi +} + +function limitTests { + # Check for option to limit the set of test binaries to build + if [[ -n $(echo $ARGS | { grep -E "\-\-limit\-tests" 
|| true; } ) ]]; then + # There are possible weird edge cases that may cause this regex filter to output nothing and fail silently + # the true pipe will catch any weird edge cases that may happen and will cause the program to fall back + # on the invalid option error + LIMIT_TEST_TARGETS=$(echo $ARGS | sed -e 's/.*--limit-tests=//' -e 's/ .*//') + if [[ -n ${LIMIT_TEST_TARGETS} ]]; then + # Remove the full LIMIT_TEST_TARGETS argument from list of args so that it passes validArgs function + ARGS=${ARGS//--limit-tests=$LIMIT_TEST_TARGETS/} + TEST_TARGETS=${LIMIT_TEST_TARGETS} + echo "Limiting tests to $TEST_TARGETS" + fi + fi +} + +function limitBench { + # Check for option to limit the set of benchmark binaries to build + if [[ -n $(echo $ARGS | { grep -E "\-\-limit\-bench-prims" || true; } ) ]]; then + # There are possible weird edge cases that may cause this regex filter to output nothing and fail silently + # the true pipe will catch any weird edge cases that may happen and will cause the program to fall back + # on the invalid option error + LIMIT_PRIMS_BENCH_TARGETS=$(echo $ARGS | sed -e 's/.*--limit-bench-prims=//' -e 's/ .*//') + if [[ -n ${LIMIT_PRIMS_BENCH_TARGETS} ]]; then + # Remove the full LIMIT_PRIMS_BENCH_TARGETS argument from list of args so that it passes validArgs function + ARGS=${ARGS//--limit-bench-prims=$LIMIT_PRIMS_BENCH_TARGETS/} + PRIMS_BENCH_TARGETS=${LIMIT_PRIMS_BENCH_TARGETS} + fi + fi +} + +function limitAnnBench { + # Check for option to limit the set of ann benchmark binaries to build + if [[ -n $(echo $ARGS | { grep -E "\-\-limit\-bench-ann" || true; } ) ]]; then + # There are possible weird edge cases that may cause this regex filter to output nothing and fail silently + # the true pipe will catch any weird edge cases that may happen and will cause the program to fall back + # on the invalid option error + LIMIT_ANN_BENCH_TARGETS=$(echo $ARGS | sed -e 's/.*--limit-bench-ann=//' -e 's/ .*//') + if [[ -n ${LIMIT_ANN_BENCH_TARGETS} ]]; then + # 
Remove the full LIMIT_ANN_BENCH_TARGETS argument from list of args so that it passes validArgs function + ARGS=${ARGS//--limit-bench-ann=$LIMIT_ANN_BENCH_TARGETS/} + ANN_BENCH_TARGETS=${LIMIT_ANN_BENCH_TARGETS} + fi + fi +} + +function buildMetrics { + # Check for multiple build-metrics options + if [[ $(echo $ARGS | { grep -Eo "\-\-build\-metrics" || true; } | wc -l ) -gt 1 ]]; then + echo "Multiple --build-metrics options were provided, please provide only one: ${ARGS}" + exit 1 + fi + # Check for build-metrics option + if [[ -n $(echo $ARGS | { grep -E "\-\-build\-metrics" || true; } ) ]]; then + # There are possible weird edge cases that may cause this regex filter to output nothing and fail silently + # the true pipe will catch any weird edge cases that may happen and will cause the program to fall back + # on the invalid option error + BUILD_REPORT_METRICS=$(echo $ARGS | sed -e 's/.*--build-metrics=//' -e 's/ .*//') + if [[ -n ${BUILD_REPORT_METRICS} ]]; then + # Remove the full BUILD_REPORT_METRICS argument from list of args so that it passes validArgs function + ARGS=${ARGS//--build-metrics=$BUILD_REPORT_METRICS/} + fi + fi +} + +if hasArg -h || hasArg --help; then + echo "${HELP}" + exit 0 +fi + +# Check for valid usage +if (( ${NUMARGS} != 0 )); then + cmakeArgs + cacheTool + limitTests + limitBench + limitAnnBench + buildMetrics + for a in ${ARGS}; do + if ! (echo " ${VALIDARGS} " | grep -q " ${a} "); then + echo "Invalid option: ${a}" + exit 1 + fi + done +fi + +# This should run before build/install +if hasArg --uninstall; then + UNINSTALL=1 + + if hasArg cuvs || hasArg libcuvs || (( ${NUMARGS} == 1 )); then + + echo "Removing libcuvs files..." + if [ -e ${LIBCUVS_BUILD_DIR}/install_manifest.txt ]; then + xargs rm -fv < ${LIBCUVS_BUILD_DIR}/install_manifest.txt > /dev/null 2>&1 + fi + fi + + if hasArg cuvs || (( ${NUMARGS} == 1 )); then + echo "Uninstalling cuvs package..." 
+ if [ -e ${PYLIBCUVS_BUILD_DIR}/install_manifest.txt ]; then + xargs rm -fv < ${PYLIBCUVS_BUILD_DIR}/install_manifest.txt > /dev/null 2>&1 + fi + + # Try to uninstall via pip if it is installed + if [ -x "$(command -v pip)" ]; then + echo "Using pip to uninstall cuvs" + pip uninstall -y cuvs + + # Otherwise, try to uninstall through conda if that's where things are installed + elif [ -x "$(command -v conda)" ] && [ "$INSTALL_PREFIX" == "$CONDA_PREFIX" ]; then + echo "Using conda to uninstall cuvs" + conda uninstall -y cuvs + + # Otherwise, fail + else + echo "Could not uninstall cuvs from pip or conda. cuvs package will need to be manually uninstalled" + fi + fi + + if hasArg cuvs-dask || (( ${NUMARGS} == 1 )); then + echo "Uninstalling cuvs-dask package..." + if [ -e ${CUVS_DASK_BUILD_DIR}/install_manifest.txt ]; then + xargs rm -fv < ${CUVS_DASK_BUILD_DIR}/install_manifest.txt > /dev/null 2>&1 + fi + + # Try to uninstall via pip if it is installed + if [ -x "$(command -v pip)" ]; then + echo "Using pip to uninstall cuvs-dask" + pip uninstall -y cuvs-dask + + # Otherwise, try to uninstall through conda if that's where things are installed + elif [ -x "$(command -v conda)" ] && [ "$INSTALL_PREFIX" == "$CONDA_PREFIX" ]; then + echo "Using conda to uninstall cuvs-dask" + conda uninstall -y cuvs-dask + + # Otherwise, fail + else + echo "Could not uninstall cuvs-dask from pip or conda. cuvs-dask package will need to be manually uninstalled." 
+ fi + fi + exit 0 +fi + + +# Process flags +if hasArg -n; then + INSTALL_TARGET="" +fi + +if hasArg -v; then + VERBOSE_FLAG="-v" + CMAKE_LOG_LEVEL="VERBOSE" +fi +if hasArg -g; then + BUILD_TYPE=Debug +fi + +if hasArg --allgpuarch; then + BUILD_ALL_GPU_ARCH=1 +fi + +if hasArg --compile-lib || (( ${NUMARGS} == 0 )); then + COMPILE_LIBRARY=ON + CMAKE_TARGET="${CMAKE_TARGET};cuvs_lib" +fi + +if hasArg --compile-static-lib || (( ${NUMARGS} == 0 )); then + COMPILE_LIBRARY=ON + CMAKE_TARGET="${CMAKE_TARGET};cuvs_lib_static" +fi + +if hasArg tests || (( ${NUMARGS} == 0 )); then + BUILD_TESTS=ON + CMAKE_TARGET="${CMAKE_TARGET};${TEST_TARGETS}" + + # Force compile library when needed test targets are specified + if [[ $CMAKE_TARGET == *"CLUSTER_TEST"* || \ + $CMAKE_TARGET == *"DISTANCE_TEST"* || \ + $CMAKE_TARGET == *"MATRIX_TEST"* || \ + $CMAKE_TARGET == *"NEIGHBORS_ANN_CAGRA_TEST"* || \ + $CMAKE_TARGET == *"NEIGHBORS_ANN_IVF_TEST"* || \ + $CMAKE_TARGET == *"NEIGHBORS_ANN_NN_DESCENT_TEST"* || \ + $CMAKE_TARGET == *"NEIGHBORS_TEST"* || \ + $CMAKE_TARGET == *"SPARSE_DIST_TEST" || \ + $CMAKE_TARGET == *"SPARSE_NEIGHBORS_TEST"* || \ + $CMAKE_TARGET == *"STATS_TEST"* ]]; then + echo "-- Enabling compiled lib for gtests" + COMPILE_LIBRARY=ON + fi +fi + +if hasArg bench-prims || (( ${NUMARGS} == 0 )); then + BUILD_PRIMS_BENCH=ON + CMAKE_TARGET="${CMAKE_TARGET};${PRIMS_BENCH_TARGETS}" + + # Force compile library when needed benchmark targets are specified + if [[ $CMAKE_TARGET == *"CLUSTER_PRIMS_BENCH"* || \ + $CMAKE_TARGET == *"MATRIX_PRIMS_BENCH"* || \ + $CMAKE_TARGET == *"NEIGHBORS_PRIMS_BENCH"* ]]; then + echo "-- Enabling compiled lib for benchmarks" + COMPILE_LIBRARY=ON + fi +fi + +if hasArg bench-ann || (( ${NUMARGS} == 0 )); then + BUILD_ANN_BENCH=ON + CMAKE_TARGET="${CMAKE_TARGET};${ANN_BENCH_TARGETS}" + if hasArg --cpu-only; then + COMPILE_LIBRARY=OFF + BUILD_CPU_ONLY=ON + NVTX=OFF + else + COMPILE_LIBRARY=ON + fi +fi + +if hasArg --no-nvtx; then + NVTX=OFF +fi +if 
hasArg --time; then + echo "-- Logging compile times to cpp/build/nvcc_compile_log.csv" + LOG_COMPILE_TIME=ON +fi +if hasArg --show_depr_warn; then + DISABLE_DEPRECATION_WARNINGS=OFF +fi +if hasArg clean; then + CLEAN=1 +fi +if hasArg --incl-cache-stats; then + BUILD_REPORT_INCL_CACHE_STATS=ON +fi +if [[ ${CMAKE_TARGET} == "" ]]; then + CMAKE_TARGET="all" +fi + +# Append `-DFIND_CUVS_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option. +SKBUILD_EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}" +if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_CUVS_CPP"* ]]; then + SKBUILD_EXTRA_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS} -DFIND_CUVS_CPP=ON" +fi + +# If clean given, run it prior to any other steps +if (( ${CLEAN} == 1 )); then + # If the dirs to clean are mounted dirs in a container, the + # contents should be removed but the mounted dirs will remain. + # The find removes all contents but leaves the dirs, the rmdir + # attempts to remove the dirs but can fail safely. + for bd in ${BUILD_DIRS}; do + if [ -d ${bd} ]; then + find ${bd} -mindepth 1 -delete + rmdir ${bd} || true + fi + done +fi + +################################################################################ +# Configure for building all C++ targets +if (( ${NUMARGS} == 0 )) || hasArg libcuvs || hasArg docs || hasArg tests || hasArg bench-prims || hasArg bench-ann; then + if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then + CUVS_CMAKE_CUDA_ARCHITECTURES="NATIVE" + echo "Building for the architecture of the GPU in the system..." + else + CUVS_CMAKE_CUDA_ARCHITECTURES="RAPIDS" + echo "Building for *ALL* supported GPU architectures..." 
+ fi + + # get the current count before the compile starts + CACHE_TOOL=${CACHE_TOOL:-sccache} + if [[ "$BUILD_REPORT_INCL_CACHE_STATS" == "ON" && -x "$(command -v ${CACHE_TOOL})" ]]; then + "${CACHE_TOOL}" --zero-stats + fi + + mkdir -p ${LIBCUVS_BUILD_DIR} + cd ${LIBCUVS_BUILD_DIR} + cmake -S ${REPODIR}/cpp -B ${LIBCUVS_BUILD_DIR} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DCMAKE_CUDA_ARCHITECTURES=${CUVS_CMAKE_CUDA_ARCHITECTURES} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCUVS_COMPILE_LIBRARY=${COMPILE_LIBRARY} \ + -DCUVS_NVTX=${NVTX} \ + -DCUDA_LOG_COMPILE_TIME=${LOG_COMPILE_TIME} \ + -DDISABLE_DEPRECATION_WARNINGS=${DISABLE_DEPRECATION_WARNINGS} \ + -DBUILD_TESTS=${BUILD_TESTS} \ + -DBUILD_PRIMS_BENCH=${BUILD_PRIMS_BENCH} \ + -DBUILD_ANN_BENCH=${BUILD_ANN_BENCH} \ + -DBUILD_CPU_ONLY=${BUILD_CPU_ONLY} \ + -DCMAKE_MESSAGE_LOG_LEVEL=${CMAKE_LOG_LEVEL} \ + ${CACHE_ARGS} \ + ${EXTRA_CMAKE_ARGS} + + compile_start=$(date +%s) + if [[ ${CMAKE_TARGET} != "" ]]; then + echo "-- Compiling targets: ${CMAKE_TARGET}, verbose=${VERBOSE_FLAG}" + if [[ ${INSTALL_TARGET} != "" ]]; then + cmake --build "${LIBCUVS_BUILD_DIR}" ${VERBOSE_FLAG} -j${PARALLEL_LEVEL} --target ${CMAKE_TARGET} ${INSTALL_TARGET} + else + cmake --build "${LIBCUVS_BUILD_DIR}" ${VERBOSE_FLAG} -j${PARALLEL_LEVEL} --target ${CMAKE_TARGET} + fi + fi + compile_end=$(date +%s) + compile_total=$(( compile_end - compile_start )) + + if [[ -n "$BUILD_REPORT_METRICS" && -f "${LIBCUVS_BUILD_DIR}/.ninja_log" ]]; then + if ! rapids-build-metrics-reporter.py 2> /dev/null && [ ! 
-f rapids-build-metrics-reporter.py ]; then + echo "Downloading rapids-build-metrics-reporter.py" + curl -sO https://raw.githubusercontent.com/rapidsai/build-metrics-reporter/v1/rapids-build-metrics-reporter.py + fi + + echo "Formatting build metrics" + MSG="" + # get some sccache/ccache stats after the compile + if [[ "$BUILD_REPORT_INCL_CACHE_STATS" == "ON" ]]; then + if [[ ${CACHE_TOOL} == "sccache" && -x "$(command -v sccache)" ]]; then + COMPILE_REQUESTS=$(sccache -s | grep "Compile requests \+ [0-9]\+$" | awk '{ print $NF }') + CACHE_HITS=$(sccache -s | grep "Cache hits \+ [0-9]\+$" | awk '{ print $NF }') + HIT_RATE=$(echo - | awk "{printf \"%.2f\n\", $CACHE_HITS / $COMPILE_REQUESTS * 100}") + MSG="${MSG}
cache hit rate ${HIT_RATE} %" + elif [[ ${CACHE_TOOL} == "ccache" && -x "$(command -v ccache)" ]]; then + CACHE_STATS_LINE=$(ccache -s | grep "Hits: \+ [0-9]\+ / [0-9]\+" | tail -n1) + if [[ ! -z "$CACHE_STATS_LINE" ]]; then + CACHE_HITS=$(echo "$CACHE_STATS_LINE" - | awk '{ print $2 }') + COMPILE_REQUESTS=$(echo "$CACHE_STATS_LINE" - | awk '{ print $4 }') + HIT_RATE=$(echo - | awk "{printf \"%.2f\n\", $CACHE_HITS / $COMPILE_REQUESTS * 100}") + MSG="${MSG}
cache hit rate ${HIT_RATE} %" + fi + fi + fi + MSG="${MSG}
parallel setting: $PARALLEL_LEVEL" + MSG="${MSG}
parallel build time: $compile_total seconds" + if [[ -f "${LIBCUVS_BUILD_DIR}/libcuvs.so" ]]; then + LIBCUVS_FS=$(ls -lh ${LIBCUVS_BUILD_DIR}/libcuvs.so | awk '{print $5}') + MSG="${MSG}
libcuvs.so size: $LIBCUVS_FS" + fi + BMR_DIR=${RAPIDS_ARTIFACTS_DIR:-"${LIBCUVS_BUILD_DIR}"} + echo "The HTML report can be found at [${BMR_DIR}/${BUILD_REPORT_METRICS}.html]. In CI, this report" + echo "will also be uploaded to the appropriate subdirectory of https://downloads.rapids.ai/ci/cuvs/, and" + echo "the entire URL can be found in \"conda-cpp-build\" runs under the task \"Upload additional artifacts\"" + mkdir -p ${BMR_DIR} + MSG_OUTFILE="$(mktemp)" + echo "$MSG" > "${MSG_OUTFILE}" + PATH=".:$PATH" python rapids-build-metrics-reporter.py ${LIBCUVS_BUILD_DIR}/.ninja_log --fmt html --msg "${MSG_OUTFILE}" > ${BMR_DIR}/${BUILD_REPORT_METRICS}.html + cp ${LIBCUVS_BUILD_DIR}/.ninja_log ${BMR_DIR}/ninja.log + fi +fi + +# Build and (optionally) install the cuvs Python package +if (( ${NUMARGS} == 0 )) || hasArg cuvs; then + SKBUILD_CONFIGURE_OPTIONS="${SKBUILD_EXTRA_CMAKE_ARGS}" \ + SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL}" \ + python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/cuvs +fi + +# Build and (optionally) install the cuvs-dask Python package +if (( ${NUMARGS} == 0 )) || hasArg cuvs-dask; then + SKBUILD_CONFIGURE_OPTIONS="${SKBUILD_EXTRA_CMAKE_ARGS}" \ + SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL}" \ + python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/cuvs-dask +fi + +# Build and (optionally) install the cuvs-ann-bench Python package +if (( ${NUMARGS} == 0 )) || hasArg bench-ann; then + python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/cuvs-ann-bench -vvv +fi + +if hasArg docs; then + set -x + cd ${DOXYGEN_BUILD_DIR} + doxygen Doxyfile + cd ${SPHINX_BUILD_DIR} + sphinx-build -b html source _html +fi + +################################################################################ +# Initiate build for example CUVS application template (if needed) + +if hasArg template; then + pushd ${REPODIR}/cpp/template + ./build.sh + popd +fi diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh new file mode 
100755 index 000000000..9583b79bd --- /dev/null +++ b/ci/build_cpp.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (c) 2022-2023, NVIDIA CORPORATION. + +set -euo pipefail + +source rapids-env-update + +export CMAKE_GENERATOR=Ninja + +rapids-print-env + +version=$(rapids-generate-version) + +rapids-logger "Begin cpp build" + +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild conda/recipes/libcuvs + +rapids-upload-conda-to-s3 cpp diff --git a/ci/build_docs.sh b/ci/build_docs.sh new file mode 100755 index 000000000..4538f3522 --- /dev/null +++ b/ci/build_docs.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -euo pipefail + +rapids-logger "Create test conda environment" +. /opt/conda/etc/profile.d/conda.sh + +rapids-dependency-file-generator \ + --output conda \ + --file_key docs \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + +rapids-mamba-retry env create --force -f env.yaml -n docs +conda activate docs + +rapids-print-env + +rapids-logger "Downloading artifacts from previous jobs" +CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) +PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) + +rapids-mamba-retry install \ + --channel "${CPP_CHANNEL}" \ + --channel "${PYTHON_CHANNEL}" \ + libcuvs \ + libcuvs-headers \ + cuvs \ + raft-dask + +export RAPIDS_VERSION_NUMBER="24.02" +export RAPIDS_DOCS_DIR="$(mktemp -d)" + +rapids-logger "Build CPP docs" +pushd cpp/doxygen +doxygen Doxyfile +popd + +rapids-logger "Build Python docs" +pushd docs +sphinx-build -b dirhtml source _html +sphinx-build -b text source _text +mkdir -p "${RAPIDS_DOCS_DIR}/raft/"{html,txt} +mv _html/* "${RAPIDS_DOCS_DIR}/raft/html" +mv _text/* "${RAPIDS_DOCS_DIR}/raft/txt" +popd + +rapids-upload-docs diff --git a/ci/build_python.sh b/ci/build_python.sh new file mode 100755 index 000000000..c35e39f98 --- /dev/null +++ b/ci/build_python.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Copyright (c) 2022-2023, NVIDIA 
CORPORATION. + +set -euo pipefail + +source rapids-env-update + +export CMAKE_GENERATOR=Ninja + +rapids-print-env + +rapids-logger "Begin py build" + +CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) + +version=$(rapids-generate-version) +git_commit=$(git rev-parse HEAD) +export RAPIDS_PACKAGE_VERSION=${version} +echo "${version}" > VERSION + +package_dir="python" +for package_name in cuvs raft-dask; do + underscore_package_name=$(echo "${package_name}" | tr "-" "_") + sed -i "/^__git_commit__/ s/= .*/= \"${git_commit}\"/g" "${package_dir}/${package_name}/${underscore_package_name}/_version.py" +done + +# TODO: Remove `--no-test` flags once importing on a CPU +# node works correctly +rapids-conda-retry mambabuild \ + --no-test \ + --channel "${CPP_CHANNEL}" \ + conda/recipes/cuvs + + +# Build ann-bench for each cuda and python version +rapids-conda-retry mambabuild \ +--no-test \ +--channel "${CPP_CHANNEL}" \ +--channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ +conda/recipes/cuda-ann-bench + +# Build ann-bench-cpu only in CUDA 11 jobs since it only depends on python +# version +RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" +if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then + rapids-conda-retry mambabuild \ + --no-test \ + --channel "${CPP_CHANNEL}" \ + --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ + conda/recipes/cuda-ann-bench-cpu +fi + +rapids-upload-conda-to-s3 python diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh new file mode 100755 index 000000000..118369fcf --- /dev/null +++ b/ci/build_wheel.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -euo pipefail + +package_name=$1 +package_dir=$2 +underscore_package_name=$(echo "${package_name}" | tr "-" "_") + +source rapids-configure-sccache +source rapids-date-string + +version=$(rapids-generate-version) +git_commit=$(git rev-parse HEAD) + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + +# This is the version of the suffix with a preceding hyphen. 
It's used +# everywhere except in the final wheel name. +PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" + +# Patch project metadata files to include the CUDA version suffix and version override. +pyproject_file="${package_dir}/pyproject.toml" +version_file="${package_dir}/${underscore_package_name}/_version.py" + +sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} +echo "${version}" > VERSION +sed -i "/^__git_commit__ / s/= .*/= \"${git_commit}\"/g" ${version_file} + +# For nightlies we want to ensure that we're pulling in alphas as well. The +# easiest way to do so is to augment the spec with a constraint containing a +# min alpha version that doesn't affect the version bounds but does allow usage +# of alpha versions for that dependency without --pre +alpha_spec='' +if ! rapids-is-release-build; then + alpha_spec=',>=0.0.0a0' +fi + +if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then + sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file} + sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file} +fi + +cd "${package_dir}" + +# Hardcode the output dir +python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check + +mkdir -p final_dist +python -m auditwheel repair -w final_dist dist/* + +RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist diff --git a/ci/build_wheel_cuvs.sh b/ci/build_wheel_cuvs.sh new file mode 100755 index 000000000..9d2f96996 --- /dev/null +++ b/ci/build_wheel_cuvs.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -euo pipefail + +# Set up skbuild options. 
Enable sccache in skbuild config options +export SKBUILD_CONFIGURE_OPTIONS="-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + +ci/build_wheel.sh cuvs python/cuvs diff --git a/ci/check_style.sh b/ci/check_style.sh new file mode 100755 index 000000000..0ee6e88e5 --- /dev/null +++ b/ci/check_style.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (c) 2020-2023, NVIDIA CORPORATION. + +set -euo pipefail + +rapids-logger "Create checks conda environment" +. /opt/conda/etc/profile.d/conda.sh + +rapids-dependency-file-generator \ + --output conda \ + --file_key checks \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + +rapids-mamba-retry env create --force -f env.yaml -n checks +conda activate checks + +# Run pre-commit checks +pre-commit run --all-files --show-diff-on-failure diff --git a/ci/checks/black_lists.sh b/ci/checks/black_lists.sh new file mode 100755 index 000000000..ed61527c3 --- /dev/null +++ b/ci/checks/black_lists.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +########################################## +# RAFT black listed function call Tester # +########################################## + +# PR_TARGET_BRANCH is set by the CI environment + +git checkout --quiet $PR_TARGET_BRANCH + +# Switch back to tip of PR branch +git checkout --quiet current-pr-branch + +# Ignore errors during searching +set +e + +# Disable history expansion to enable use of ! 
in perl regex +set +H + +RETVAL=0 + +for black_listed in cudaDeviceSynchronize cudaMalloc cudaMallocManaged cudaFree cudaMallocHost cudaHostAlloc cudaFreeHost; do + TMP=`git --no-pager diff --ignore-submodules -w --minimal -U0 -S"$black_listed" $PR_TARGET_BRANCH | grep '^+' | grep -v '^+++' | grep "$black_listed"` + if [ "$TMP" != "" ]; then + for filename in `git --no-pager diff --ignore-submodules -w --minimal --name-only -S"$black_listed" $PR_TARGET_BRANCH`; do + basefilename=$(basename -- "$filename") + filext="${basefilename##*.}" + if [ "$filext" != "md" ] && [ "$filext" != "sh" ]; then + TMP2=`git --no-pager diff --ignore-submodules -w --minimal -U0 -S"$black_listed" $PR_TARGET_BRANCH -- $filename | grep '^+' | grep -v '^+++' | grep "$black_listed" | grep -vE "^\+[[:space:]]*/{2,}.*$black_listed"` + if [ "$TMP2" != "" ]; then + echo "=== ERROR: black listed function call $black_listed added to $filename ===" + git --no-pager diff --ignore-submodules -w --minimal -S"$black_listed" $PR_TARGET_BRANCH -- $filename + echo "=== END ERROR ===" + RETVAL=1 + fi + fi + done + fi +done + +for cond_black_listed in cudaMemcpy cudaMemset; do + TMP=`git --no-pager diff --ignore-submodules -w --minimal -U0 -S"$cond_black_listed" $PR_TARGET_BRANCH | grep '^+' | grep -v '^+++' | grep -P "$cond_black_listed(?!Async)"` + + if [ "$TMP" != "" ]; then + for filename in `git --no-pager diff --ignore-submodules -w --minimal --name-only -S"$cond_black_listed" $PR_TARGET_BRANCH`; do + basefilename=$(basename -- "$filename") + filext="${basefilename##*.}" + if [ "$filext" != "md" ] && [ "$filext" != "sh" ]; then + TMP2=`git --no-pager diff --ignore-submodules -w --minimal -U0 -S"$cond_black_listed" $PR_TARGET_BRANCH -- $filename | grep '^+' | grep -v '^+++' | grep -P "$cond_black_listed(?!Async)" | grep -vE "^\+[[:space:]]*/{2,}.*$cond_black_listed"` + if [ "$TMP2" != "" ]; then + echo "=== ERROR: black listed function call $cond_black_listed added to $filename ===" + git --no-pager 
diff --ignore-submodules -w --minimal -S"$cond_black_listed" $PR_TARGET_BRANCH -- $filename + echo "=== END ERROR ===" + RETVAL=1 + fi + fi + done + fi +done + +exit $RETVAL diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py new file mode 100644 index 000000000..a49f27cd2 --- /dev/null +++ b/ci/checks/copyright.py @@ -0,0 +1,289 @@ +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import datetime +import re +import argparse +import io +import os +import sys + +import git + +SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) + +# Add the scripts dir for gitutils +sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, + "../../cpp/scripts"))) + +# Now import gitutils. Ignore flake8 error here since there is no other way to +# set up imports +import gitutils # noqa: E402 + +FilesToCheck = [ + re.compile(r"[.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$"), + re.compile(r"CMakeLists[.]txt$"), + re.compile(r"CMakeLists_standalone[.]txt$"), + re.compile(r"setup[.]cfg$"), + re.compile(r"meta[.]yaml$") +] +ExemptFiles = [ + re.compile("cpp/include/cuvs/neighbors/detail/faiss_select/"), + re.compile("docs/source/sphinxext/github_link.py"), + re.compile("cpp/cmake/modules/FindAVX.cmake") +] + +# this will break starting at year 10000, which is probably OK :) +CheckSimple = re.compile( + r"Copyright *(?:\(c\))? *(\d{4}),? 
*NVIDIA C(?:ORPORATION|orporation)") +CheckDouble = re.compile( + r"Copyright *(?:\(c\))? *(\d{4})-(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)" # noqa: E501 +) + + +def checkThisFile(f): + if isinstance(f, git.Diff): + if f.deleted_file or f.b_blob.size == 0: + return False + f = f.b_path + elif not os.path.exists(f) or os.stat(f).st_size == 0: + # This check covers things like symlinks which point to files that DNE + return False + for exempt in ExemptFiles: + if exempt.search(f): + return False + for checker in FilesToCheck: + if checker.search(f): + return True + return False + + +def modifiedFiles(): + """Get a set of all modified files, as Diff objects. + + The files returned have been modified in git since the merge base of HEAD + and the upstream of the target branch. We return the Diff objects so that + we can read only the staged changes. + """ + repo = git.Repo() + # Use the environment variable TARGET_BRANCH or RAPIDS_BASE_BRANCH (defined in CI) if possible + target_branch = os.environ.get("TARGET_BRANCH", os.environ.get("RAPIDS_BASE_BRANCH")) + if target_branch is None: + # Fall back to the closest branch if not on CI + target_branch = repo.git.describe( + all=True, tags=True, match="branch-*", abbrev=0 + ).lstrip("heads/") + + upstream_target_branch = None + if target_branch in repo.heads: + # Use the tracking branch of the local reference if it exists. This + # returns None if no tracking branch is set. + upstream_target_branch = repo.heads[target_branch].tracking_branch() + if upstream_target_branch is None: + # Fall back to the remote with the newest target_branch. This code + # path is used on CI because the only local branch reference is + # current-pr-branch, and thus target_branch is not in repo.heads. + # This also happens if no tracking branch is defined for the local + # target_branch. We use the remote with the latest commit if + # multiple remotes are defined. 
+ candidate_branches = [ + remote.refs[target_branch] for remote in repo.remotes + if target_branch in remote.refs + ] + if len(candidate_branches) > 0: + upstream_target_branch = sorted( + candidate_branches, + key=lambda branch: branch.commit.committed_datetime, + )[-1] + else: + # If no remotes are defined, try to use the local version of the + # target_branch. If this fails, the repo configuration must be very + # strange and we can fix this script on a case-by-case basis. + upstream_target_branch = repo.heads[target_branch] + merge_base = repo.merge_base("HEAD", upstream_target_branch.commit)[0] + diff = merge_base.diff() + changed_files = {f for f in diff if f.b_path is not None} + return changed_files + + +def getCopyrightYears(line): + res = CheckSimple.search(line) + if res: + return int(res.group(1)), int(res.group(1)) + res = CheckDouble.search(line) + if res: + return int(res.group(1)), int(res.group(2)) + return None, None + + +def replaceCurrentYear(line, start, end): + # first turn a simple regex into double (if applicable). 
then update years + res = CheckSimple.sub(r"Copyright (c) \1-\1, NVIDIA CORPORATION", line) + res = CheckDouble.sub( + rf"Copyright (c) {start:04d}-{end:04d}, NVIDIA CORPORATION", + res, + ) + return res + + +def checkCopyright(f, update_current_year): + """Checks for copyright headers and their years.""" + errs = [] + thisYear = datetime.datetime.now().year + lineNum = 0 + crFound = False + yearMatched = False + + if isinstance(f, git.Diff): + path = f.b_path + lines = f.b_blob.data_stream.read().decode().splitlines(keepends=True) + else: + path = f + with open(f, encoding="utf-8") as fp: + lines = fp.readlines() + + for line in lines: + lineNum += 1 + start, end = getCopyrightYears(line) + if start is None: + continue + crFound = True + if start > end: + e = [ + path, + lineNum, + "First year after second year in the copyright " + "header (manual fix required)", + None, + ] + errs.append(e) + elif thisYear < start or thisYear > end: + e = [ + path, + lineNum, + "Current year not included in the copyright header", + None, + ] + if thisYear < start: + e[-1] = replaceCurrentYear(line, thisYear, end) + if thisYear > end: + e[-1] = replaceCurrentYear(line, start, thisYear) + errs.append(e) + else: + yearMatched = True + # copyright header itself not found + if not crFound: + e = [ + path, + 0, + "Copyright header missing or formatted incorrectly " + "(manual fix required)", + None, + ] + errs.append(e) + # even if the year matches a copyright header, make the check pass + if yearMatched: + errs = [] + + if update_current_year: + errs_update = [x for x in errs if x[-1] is not None] + if len(errs_update) > 0: + lines_changed = ", ".join(str(x[1]) for x in errs_update) + print(f"File: {path}. 
Changing line(s) {lines_changed}") + for _, lineNum, __, replacement in errs_update: + lines[lineNum - 1] = replacement + with open(path, "w", encoding="utf-8") as out_file: + out_file.writelines(lines) + + return errs + + +def getAllFilesUnderDir(root, pathFilter=None): + retList = [] + for dirpath, dirnames, filenames in os.walk(root): + for fn in filenames: + filePath = os.path.join(dirpath, fn) + if pathFilter(filePath): + retList.append(filePath) + return retList + + +def checkCopyright_main(): + """ + Checks for copyright headers in all the modified files. In case of local + repo, this script will just look for uncommitted files and in case of CI + it compares between branches "$PR_TARGET_BRANCH" and "current-pr-branch" + """ + retVal = 0 + + argparser = argparse.ArgumentParser( + "Checks for a consistent copyright header in git's modified files" + ) + argparser.add_argument( + "--update-current-year", + dest="update_current_year", + action="store_true", + required=False, + help="If set, " + "update the current year if a header is already " + "present and well formatted.", + ) + argparser.add_argument( + "--git-modified-only", + dest="git_modified_only", + action="store_true", + required=False, + help="If set, " + "only files seen as modified by git will be " + "processed.", + ) + + args, dirs = argparser.parse_known_args() + + if args.git_modified_only: + files = [f for f in modifiedFiles() if checkThisFile(f)] + else: + files = [] + for d in [os.path.abspath(d) for d in dirs]: + if not os.path.isdir(d): + raise ValueError(f"{d} is not a directory.") + files += getAllFilesUnderDir(d, pathFilter=checkThisFile) + + errors = [] + for f in files: + errors += checkCopyright(f, args.update_current_year) + + if len(errors) > 0: + if any(e[-1] is None for e in errors): + print("Copyright headers incomplete in some of the files!") + for e in errors: + print(" %s:%d Issue: %s" % (e[0], e[1], e[2])) + print("") + n_fixable = sum(1 for e in errors if e[-1] is not None) 
+ path_parts = os.path.abspath(__file__).split(os.sep) + file_from_repo = os.sep.join(path_parts[path_parts.index("ci") :]) + if n_fixable > 0 and not args.update_current_year: + print( + f"You can run `python {file_from_repo} --git-modified-only " + "--update-current-year` and stage the results in git to " + f"fix {n_fixable} of these errors.\n" + ) + retVal = 1 + + return retVal + + +if __name__ == "__main__": + sys.exit(checkCopyright_main()) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh new file mode 100755 index 000000000..f463aeb65 --- /dev/null +++ b/ci/release/update-version.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +######################## +# RAFT Version Updater # +######################## + +## Usage +# bash update-version.sh + + +# Format is YY.MM.PP - no leading 'v' or trailing 'a' +NEXT_FULL_TAG=$1 + +# Get current version +CURRENT_TAG=$(git tag --merged HEAD | grep -xE '^v.*' | sort --version-sort | tail -n 1 | tr -d 'v') +CURRENT_MAJOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[1]}') +CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}') +CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}') +CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} + +#Get . 
for next version +NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') +NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') +NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} +NEXT_UCX_PY_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})" +NEXT_UCX_PY_VERSION="${NEXT_UCX_PY_SHORT_TAG}.*" + +# Need to distutils-normalize the original version +NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") +NEXT_UCX_PY_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_UCX_PY_SHORT_TAG}'))") + +echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" + +# Inplace sed replace; workaround for Linux and Mac +function sed_runner() { + sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak +} + +sed_runner "s/set(RAPIDS_VERSION .*)/set(RAPIDS_VERSION \"${NEXT_SHORT_TAG}\")/g" cpp/CMakeLists.txt +sed_runner "s/set(RAPIDS_VERSION .*)/set(RAPIDS_VERSION \"${NEXT_SHORT_TAG}\")/g" cpp/template/cmake/thirdparty/fetch_rapids.cmake +sed_runner "s/set(RAFT_VERSION .*)/set(RAFT_VERSION \"${NEXT_FULL_TAG}\")/g" cpp/CMakeLists.txt +sed_runner 's/'"cuvs_version .*)"'/'"cuvs_version ${NEXT_FULL_TAG})"'/g' python/cuvs/CMakeLists.txt +sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake + +# Centralized version file update +echo "${NEXT_FULL_TAG}" > VERSION + +# Wheel testing script +sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_raft_dask.sh + +# Docs update +sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py +sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.py + +DEPENDENCIES=( + dask-cuda + cuvs + cuvs-cu11 + cuvs-cu12 + rmm + rmm-cu11 + rmm-cu12 + rapids-dask-dependency + # ucx-py is handled separately below +) +for FILE in dependencies.yaml conda/environments/*.yaml; do + for DEP 
in "${DEPENDENCIES[@]}"; do + sed_runner "/-.* ${DEP}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}\.*/g" ${FILE}; + done + sed_runner "/-.* ucx-py==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; +done +for FILE in python/*/pyproject.toml; do + for DEP in "${DEPENDENCIES[@]}"; do + sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" ${FILE} + done + sed_runner "/\"ucx-py==/ s/==.*\"/==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*\"/g" ${FILE} +done + +sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml + +for FILE in .github/workflows/*.yaml; do + sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" +done +sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh + +sed_runner "/^PROJECT_NUMBER/ s|\".*\"|\"${NEXT_SHORT_TAG}\"|g" cpp/doxygen/Doxyfile + +sed_runner "/^set(RAFT_VERSION/ s|\".*\"|\"${NEXT_SHORT_TAG}\"|g" docs/source/build.md +sed_runner "s|branch-[0-9][0-9].[0-9][0-9]|branch-${NEXT_SHORT_TAG}|g" docs/source/build.md +sed_runner "/rapidsai\/raft/ s|branch-[0-9][0-9].[0-9][0-9]|branch-${NEXT_SHORT_TAG}|g" docs/source/developer_guide.md + +sed_runner "s|:[0-9][0-9].[0-9][0-9]|:${NEXT_SHORT_TAG}|g" docs/source/raft_ann_benchmarks.md + +sed_runner "s|branch-[0-9][0-9].[0-9][0-9]|branch-${NEXT_SHORT_TAG}|g" README.md + +# .devcontainer files +find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do + sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}" + sed_runner "s@rapidsai/devcontainers/features/ucx:[0-9.]*@rapidsai/devcontainers/features/ucx:${NEXT_SHORT_TAG_PEP440}@" "${filename}" + sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}" +done diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh new file mode 100755 
index 000000000..95ae6a69b --- /dev/null +++ b/ci/test_cpp.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# Copyright (c) 2022-2023, NVIDIA CORPORATION. + +set -euo pipefail + +. /opt/conda/etc/profile.d/conda.sh + +rapids-logger "Generate C++ testing dependencies" +rapids-dependency-file-generator \ + --output conda \ + --file_key test_cpp \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee env.yaml + +rapids-mamba-retry env create --force -f env.yaml -n test + +# Temporarily allow unbound variables for conda activation. +set +u +conda activate test +set -u + +CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ +mkdir -p "${RAPIDS_TESTS_DIR}" + +rapids-print-env + +rapids-mamba-retry install \ + --channel "${CPP_CHANNEL}" \ + libcuvs libcuvs-tests + +rapids-logger "Check GPU usage" +nvidia-smi + +EXITCODE=0 +trap "EXITCODE=1" ERR +set +e + +# Run libcuvs gtests from libcuvs-tests package +cd "$CONDA_PREFIX"/bin/gtests/libcuvs +ctest -j8 --output-on-failure + +rapids-logger "Test script exiting with value: $EXITCODE" +exit ${EXITCODE} diff --git a/ci/test_python.sh b/ci/test_python.sh new file mode 100755 index 000000000..a65469928 --- /dev/null +++ b/ci/test_python.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Copyright (c) 2022-2023, NVIDIA CORPORATION. + +set -euo pipefail + +. /opt/conda/etc/profile.d/conda.sh + +rapids-logger "Generate Python testing dependencies" +rapids-dependency-file-generator \ + --output conda \ + --file_key test_python \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + +rapids-mamba-retry env create --force -f env.yaml -n test + +# Temporarily allow unbound variables for conda activation. 
+set +u +conda activate test +set -u + +rapids-logger "Downloading artifacts from previous jobs" +CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) +PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) + +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} +RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"} +mkdir -p "${RAPIDS_TESTS_DIR}" "${RAPIDS_COVERAGE_DIR}" + +rapids-print-env + +rapids-mamba-retry install \ + --channel "${CPP_CHANNEL}" \ + --channel "${PYTHON_CHANNEL}" \ + libcuvs cuvs + +rapids-logger "Check GPU usage" +nvidia-smi + +EXITCODE=0 +trap "EXITCODE=1" ERR +set +e + +rapids-logger "pytest cuvs" +pushd python/cuvs/cuvs +pytest \ + --cache-clear \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cuvs.xml" \ + --cov-config=../.coveragerc \ + --cov=cuvs \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cuvs-coverage.xml" \ + --cov-report=term \ + test +popd + +rapids-logger "Test script exiting with value: $EXITCODE" +exit ${EXITCODE} diff --git a/ci/test_wheel_cuvs.sh b/ci/test_wheel_cuvs.sh new file mode 100755 index 000000000..6b213d399 --- /dev/null +++ b/ci/test_wheel_cuvs.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -euo pipefail + +mkdir -p ./dist +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cuvs_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist + +# echo to expand wildcard before adding `[extra]` requires for pip +python -m pip install $(echo ./dist/cuvs*.whl)[test] + +# Run smoke tests for aarch64 pull requests +if [[ "$(arch)" == "aarch64" && "${RAPIDS_BUILD_TYPE}" == "pull-request" ]]; then + python ./ci/wheel_smoke_test_cuvs.py +else + python -m pytest ./python/cuvs/cuvs/test +fi diff --git a/ci/wheel_smoke_test_cuvs.py b/ci/wheel_smoke_test_cuvs.py new file mode 100644 index 000000000..65b5fb8b2 --- /dev/null +++ b/ci/wheel_smoke_test_cuvs.py @@ -0,0 +1,53 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +from scipy.spatial.distance import cdist + +from pylibraft.common import Handle, Stream, device_ndarray +from cuvs.distance import pairwise_distance + + +if __name__ == "__main__": + metric = "euclidean" + n_rows = 1337 + n_cols = 1337 + + input1 = np.random.random_sample((n_rows, n_cols)) + input1 = np.asarray(input1, order="C").astype(np.float64) + + output = np.zeros((n_rows, n_rows), dtype=np.float64) + + expected = cdist(input1, input1, metric) + + expected[expected <= 1e-5] = 0.0 + + input1_device = device_ndarray(input1) + output_device = None + + s2 = Stream() + handle = Handle(stream=s2) + ret_output = pairwise_distance( + input1_device, input1_device, output_device, metric, handle=handle + ) + handle.sync() + + output_device = ret_output + + actual = output_device.copy_to_host() + + actual[actual <= 1e-5] = 0.0 + + assert np.allclose(expected, actual, rtol=1e-4) diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml new file mode 100644 index 000000000..dbf92ec47 --- /dev/null +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -0,0 +1,54 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- breathe +- c-compiler +- clang-tools=16.0.6 +- clang==16.0.6 +- cmake>=3.26.4 +- cuda-nvtx=11.8 +- cuda-profiler-api=11.8.86 +- cuda-python>=11.7.1,<12.0a0 +- cuda-version=11.8 +- cudatoolkit +- cupy>=12.0.0 +- cxx-compiler +- cython>=3.0.0 +- doxygen>=1.8.20 +- gcc_linux-aarch64=11.* +- gmock>=1.13.0 +- graphviz +- gtest>=1.13.0 +- ipython +- libcublas-dev=11.11.3.6 +- libcublas=11.11.3.6 +- libcurand-dev=10.3.0.86 +- libcurand=10.3.0.86 +- libcusolver-dev=11.4.1.48 +- libcusolver=11.4.1.48 +- libcusparse-dev=11.7.5.86 +- libcusparse=11.7.5.86 +- nccl>=2.9.9 +- ninja +- numpy>=1.21 +- numpydoc +- nvcc_linux-aarch64=11.8 +- pre-commit +- pydata-sphinx-theme +- pytest +- pytest-cov +- recommonmark +- rmm==24.2.* +- scikit-build>=0.13.1 +- scikit-learn +- scipy +- sphinx-copybutton +- sphinx-markdown-tables +- sysroot_linux-aarch64==2.17 +name: all_cuda-118_arch-aarch64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml new file mode 100644 index 000000000..2fe184f96 --- /dev/null +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -0,0 +1,54 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- breathe +- c-compiler +- clang-tools=16.0.6 +- clang==16.0.6 +- cmake>=3.26.4 +- cuda-nvtx=11.8 +- cuda-profiler-api=11.8.86 +- cuda-python>=11.7.1,<12.0a0 +- cuda-version=11.8 +- cudatoolkit +- cupy>=12.0.0 +- cxx-compiler +- cython>=3.0.0 +- doxygen>=1.8.20 +- gcc_linux-64=11.* +- gmock>=1.13.0 +- graphviz +- gtest>=1.13.0 +- ipython +- libcublas-dev=11.11.3.6 +- libcublas=11.11.3.6 +- libcurand-dev=10.3.0.86 +- libcurand=10.3.0.86 +- libcusolver-dev=11.4.1.48 +- libcusolver=11.4.1.48 +- libcusparse-dev=11.7.5.86 +- libcusparse=11.7.5.86 +- nccl>=2.9.9 +- ninja +- numpy>=1.21 +- numpydoc +- nvcc_linux-64=11.8 +- pre-commit +- pydata-sphinx-theme +- pytest +- pytest-cov +- recommonmark +- rmm==24.2.* +- scikit-build>=0.13.1 +- scikit-learn +- scipy +- sphinx-copybutton +- sphinx-markdown-tables +- sysroot_linux-64==2.17 +name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-aarch64.yaml b/conda/environments/all_cuda-120_arch-aarch64.yaml new file mode 100644 index 000000000..1b7f3908a --- /dev/null +++ b/conda/environments/all_cuda-120_arch-aarch64.yaml @@ -0,0 +1,50 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- breathe +- c-compiler +- clang-tools=16.0.6 +- clang==16.0.6 +- cmake>=3.26.4 +- cuda-cudart-dev +- cuda-nvcc +- cuda-nvtx-dev +- cuda-profiler-api +- cuda-python>=12.0,<13.0a0 +- cuda-version=12.0 +- cupy>=12.0.0 +- cxx-compiler +- cython>=3.0.0 +- doxygen>=1.8.20 +- gcc_linux-aarch64=11.* +- gmock>=1.13.0 +- graphviz +- gtest>=1.13.0 +- ipython +- libcublas-dev +- libcurand-dev +- libcusolver-dev +- libcusparse-dev +- nccl>=2.9.9 +- ninja +- numpy>=1.21 +- numpydoc +- pre-commit +- pydata-sphinx-theme +- pytest +- pytest-cov +- recommonmark +- rmm==24.2.* +- scikit-build>=0.13.1 +- scikit-learn +- scipy +- sphinx-copybutton +- sphinx-markdown-tables +- sysroot_linux-aarch64==2.17 +name: all_cuda-120_arch-aarch64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml new file mode 100644 index 000000000..335227994 --- /dev/null +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -0,0 +1,50 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- breathe +- c-compiler +- clang-tools=16.0.6 +- clang==16.0.6 +- cmake>=3.26.4 +- cuda-cudart-dev +- cuda-nvcc +- cuda-nvtx-dev +- cuda-profiler-api +- cuda-python>=12.0,<13.0a0 +- cuda-version=12.0 +- cupy>=12.0.0 +- cxx-compiler +- cython>=3.0.0 +- doxygen>=1.8.20 +- gcc_linux-64=11.* +- gmock>=1.13.0 +- graphviz +- gtest>=1.13.0 +- ipython +- libcublas-dev +- libcurand-dev +- libcusolver-dev +- libcusparse-dev +- nccl>=2.9.9 +- ninja +- numpy>=1.21 +- numpydoc +- pre-commit +- pydata-sphinx-theme +- pytest +- pytest-cov +- recommonmark +- rmm==24.2.* +- scikit-build>=0.13.1 +- scikit-learn +- scipy +- sphinx-copybutton +- sphinx-markdown-tables +- sysroot_linux-64==2.17 +name: all_cuda-120_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml new file mode 100644 index 000000000..39c6d2488 --- /dev/null +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -0,0 +1,44 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- benchmark>=1.8.2 +- c-compiler +- clang-tools=16.0.6 +- clang==16.0.6 +- cmake>=3.26.4 +- cuda-nvtx=11.8 +- cuda-profiler-api=11.8.86 +- cuda-version=11.8 +- cudatoolkit +- cxx-compiler +- cython>=3.0.0 +- gcc_linux-aarch64=11.* +- glog>=0.6.0 +- h5py>=3.8.0 +- hnswlib=0.7.0 +- libcublas-dev=11.11.3.6 +- libcublas=11.11.3.6 +- libcurand-dev=10.3.0.86 +- libcurand=10.3.0.86 +- libcusolver-dev=11.4.1.48 +- libcusolver=11.4.1.48 +- libcusparse-dev=11.7.5.86 +- libcusparse=11.7.5.86 +- matplotlib +- nccl>=2.9.9 +- ninja +- nlohmann_json>=3.11.2 +- nvcc_linux-aarch64=11.8 +- openblas +- pandas +- pyyaml +- rmm==24.2.* +- scikit-build>=0.13.1 +- sysroot_linux-aarch64==2.17 +name: bench_ann_cuda-118_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml new file mode 100644 index 000000000..e0f46085d --- /dev/null +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -0,0 +1,44 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- benchmark>=1.8.2 +- c-compiler +- clang-tools=16.0.6 +- clang==16.0.6 +- cmake>=3.26.4 +- cuda-nvtx=11.8 +- cuda-profiler-api=11.8.86 +- cuda-version=11.8 +- cudatoolkit +- cxx-compiler +- cython>=3.0.0 +- gcc_linux-64=11.* +- glog>=0.6.0 +- h5py>=3.8.0 +- hnswlib=0.7.0 +- libcublas-dev=11.11.3.6 +- libcublas=11.11.3.6 +- libcurand-dev=10.3.0.86 +- libcurand=10.3.0.86 +- libcusolver-dev=11.4.1.48 +- libcusolver=11.4.1.48 +- libcusparse-dev=11.7.5.86 +- libcusparse=11.7.5.86 +- matplotlib +- nccl>=2.9.9 +- ninja +- nlohmann_json>=3.11.2 +- nvcc_linux-64=11.8 +- openblas +- pandas +- pyyaml +- rmm==24.2.* +- scikit-build>=0.13.1 +- sysroot_linux-64==2.17 +name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml new file mode 100644 index 000000000..2c69bc532 --- /dev/null +++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml @@ -0,0 +1,40 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- benchmark>=1.8.2 +- c-compiler +- clang-tools=16.0.6 +- clang==16.0.6 +- cmake>=3.26.4 +- cuda-cudart-dev +- cuda-nvcc +- cuda-nvtx-dev +- cuda-profiler-api +- cuda-version=12.0 +- cxx-compiler +- cython>=3.0.0 +- gcc_linux-aarch64=11.* +- glog>=0.6.0 +- h5py>=3.8.0 +- hnswlib=0.7.0 +- libcublas-dev +- libcurand-dev +- libcusolver-dev +- libcusparse-dev +- matplotlib +- nccl>=2.9.9 +- ninja +- nlohmann_json>=3.11.2 +- openblas +- pandas +- pyyaml +- rmm==24.2.* +- scikit-build>=0.13.1 +- sysroot_linux-aarch64==2.17 +name: bench_ann_cuda-120_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml new file mode 100644 index 000000000..4f4b0d501 --- /dev/null +++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml @@ -0,0 +1,40 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- benchmark>=1.8.2 +- c-compiler +- clang-tools=16.0.6 +- clang==16.0.6 +- cmake>=3.26.4 +- cuda-cudart-dev +- cuda-nvcc +- cuda-nvtx-dev +- cuda-profiler-api +- cuda-version=12.0 +- cxx-compiler +- cython>=3.0.0 +- gcc_linux-64=11.* +- glog>=0.6.0 +- h5py>=3.8.0 +- hnswlib=0.7.0 +- libcublas-dev +- libcurand-dev +- libcusolver-dev +- libcusparse-dev +- matplotlib +- nccl>=2.9.9 +- ninja +- nlohmann_json>=3.11.2 +- openblas +- pandas +- pyyaml +- rmm==24.2.* +- scikit-build>=0.13.1 +- sysroot_linux-64==2.17 +name: bench_ann_cuda-120_arch-x86_64 diff --git a/conda/recipes/cuda-ann-bench-cpu/build.sh b/conda/recipes/cuda-ann-bench-cpu/build.sh new file mode 100644 index 000000000..4462d5124 --- /dev/null +++ b/conda/recipes/cuda-ann-bench-cpu/build.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +./build.sh bench-ann --cpu-only --no-nvtx --build-metrics=bench_ann_cpu --incl-cache-stats +cmake --install cpp/build --component ann_bench diff --git a/conda/recipes/cuda-ann-bench-cpu/conda_build_config.yaml b/conda/recipes/cuda-ann-bench-cpu/conda_build_config.yaml new file mode 100644 index 000000000..0bd424f85 --- /dev/null +++ b/conda/recipes/cuda-ann-bench-cpu/conda_build_config.yaml @@ -0,0 +1,20 @@ +c_compiler_version: + - 11 + +cxx_compiler_version: + - 11 + +sysroot_version: + - "2.17" + +cmake_version: + - ">=3.26.4" + +glog_version: + - ">=0.6.0" + +h5py_version: + - ">=3.8.0" + +nlohmann_json_version: + - ">=3.11.2" diff --git a/conda/recipes/cuda-ann-bench-cpu/meta.yaml b/conda/recipes/cuda-ann-bench-cpu/meta.yaml new file mode 100644 index 000000000..a19920256 --- /dev/null +++ b/conda/recipes/cuda-ann-bench-cpu/meta.yaml @@ -0,0 +1,66 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. + +# Usage: +# conda build . 
-c conda-forge -c nvidia -c rapidsai +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} +{% set py_version = environ['CONDA_PY'] %} +{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} +{% set date_string = environ['RAPIDS_DATE_STRING'] %} + +package: + name: raft-ann-bench-cpu + version: {{ version }} + script: build.sh + +source: + path: ../../.. + +build: + script_env: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + - CMAKE_C_COMPILER_LAUNCHER + - CMAKE_CUDA_COMPILER_LAUNCHER + - CMAKE_CXX_COMPILER_LAUNCHER + - CMAKE_GENERATOR + - PARALLEL_LEVEL + - RAPIDS_ARTIFACTS_DIR + - SCCACHE_BUCKET + - SCCACHE_IDLE_TIMEOUT + - SCCACHE_REGION + - SCCACHE_S3_KEY_PREFIX=libcuvs-aarch64 # [aarch64] + - SCCACHE_S3_KEY_PREFIX=libcuvs-linux64 # [linux64] + - SCCACHE_S3_USE_SSL + number: {{ GIT_DESCRIBE_NUMBER }} + string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + +requirements: + build: + - {{ compiler('c') }} + - {{ compiler('cxx') }} + - cmake {{ cmake_version }} + - ninja + - sysroot_{{ target_platform }} {{ sysroot_version }} + + host: + - glog {{ glog_version }} + - matplotlib + - nlohmann_json {{ nlohmann_json_version }} + - python + - pyyaml + - pandas + + run: + - glog {{ glog_version }} + - h5py {{ h5py_version }} + - matplotlib + - python + - pyyaml + - pandas + - benchmark +about: + home: https://rapids.ai/ + license: Apache-2.0 + summary: RAFT ANN CPU benchmarks diff --git a/conda/recipes/cuda-ann-bench/build.sh b/conda/recipes/cuda-ann-bench/build.sh new file mode 100644 index 000000000..00078792a --- /dev/null +++ b/conda/recipes/cuda-ann-bench/build.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# Copyright (c) 2023, NVIDIA CORPORATION. 
+ +./build.sh bench-ann --allgpuarch --no-nvtx --build-metrics=bench_ann --incl-cache-stats +cmake --install cpp/build --component ann_bench diff --git a/conda/recipes/cuda-ann-bench/conda_build_config.yaml b/conda/recipes/cuda-ann-bench/conda_build_config.yaml new file mode 100644 index 000000000..da0b893c1 --- /dev/null +++ b/conda/recipes/cuda-ann-bench/conda_build_config.yaml @@ -0,0 +1,70 @@ +c_compiler_version: + - 11 + +cxx_compiler_version: + - 11 + +cuda_compiler: + - cuda-nvcc + +cuda11_compiler: + - nvcc + +sysroot_version: + - "2.17" + +cmake_version: + - ">=3.26.4" + +nccl_version: + - ">=2.9.9" + +gtest_version: + - ">=1.13.0" + +glog_version: + - ">=0.6.0" + +h5py_version: + - ">=3.8.0" + +nlohmann_json_version: + - ">=3.11.2" + +# The CTK libraries below are missing from the conda-forge::cudatoolkit package +# for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages +# and the "*_run_*" version specifiers correspond to `11.x` packages. + +cuda11_libcublas_host_version: + - "=11.11.3.6" + +cuda11_libcublas_run_version: + - ">=11.5.2.43,<12.0.0" + +cuda11_libcurand_host_version: + - "=10.3.0.86" + +cuda11_libcurand_run_version: + - ">=10.2.5.43,<10.3.1" + +cuda11_libcusolver_host_version: + - "=11.4.1.48" + +cuda11_libcusolver_run_version: + - ">=11.2.0.43,<11.4.2" + +cuda11_libcusparse_host_version: + - "=11.7.5.86" + +cuda11_libcusparse_run_version: + - ">=11.6.0.43,<12.0.0" + +# `cuda-profiler-api` only has `11.8.0` and `12.0.0` packages for all +# architectures. The "*_host_*" version specifiers correspond to `11.8` packages and the +# "*_run_*" version specifiers correspond to `11.x` packages. 
+ +cuda11_cuda_profiler_api_host_version: + - "=11.8.86" + +cuda11_cuda_profiler_api_run_version: + - ">=11.4.240,<12" diff --git a/conda/recipes/cuda-ann-bench/meta.yaml b/conda/recipes/cuda-ann-bench/meta.yaml new file mode 100644 index 000000000..a9e4d820a --- /dev/null +++ b/conda/recipes/cuda-ann-bench/meta.yaml @@ -0,0 +1,104 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. + +# Usage: +# conda build . -c conda-forge -c nvidia -c rapidsai +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} +{% set py_version = environ['CONDA_PY'] %} +{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} +{% set cuda_major = cuda_version.split('.')[0] %} +{% set cuda_spec = ">=" + cuda_major ~ ",<" + (cuda_major | int + 1) ~ ".0a0" %} # i.e. >=11,<12.0a0 +{% set date_string = environ['RAPIDS_DATE_STRING'] %} + +package: + name: cuda-ann-bench + version: {{ version }} + script: build.sh + +source: + path: ../../.. 
+ +build: + script_env: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + - CMAKE_C_COMPILER_LAUNCHER + - CMAKE_CUDA_COMPILER_LAUNCHER + - CMAKE_CXX_COMPILER_LAUNCHER + - CMAKE_GENERATOR + - PARALLEL_LEVEL + - RAPIDS_ARTIFACTS_DIR + - SCCACHE_BUCKET + - SCCACHE_IDLE_TIMEOUT + - SCCACHE_REGION + - SCCACHE_S3_KEY_PREFIX=libcuvs-aarch64 # [aarch64] + - SCCACHE_S3_KEY_PREFIX=libcuvs-linux64 # [linux64] + - SCCACHE_S3_USE_SSL + number: {{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + ignore_run_exports_from: + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} + +requirements: + build: + - {{ compiler('c') }} + - {{ compiler('cxx') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} + - cmake {{ cmake_version }} + - ninja + - sysroot_{{ target_platform }} {{ sysroot_version }} + + host: + - python + - libraft {{ version }} + - libcuvs {{ version }} + - cuda-version ={{ cuda_version }} + {% if cuda_major == "11" %} + - cuda-profiler-api {{ cuda11_cuda_profiler_api_run_version }} + - libcublas {{ cuda11_libcublas_host_version }} + - libcublas-dev {{ cuda11_libcublas_host_version }} + {% else %} + - cuda-profiler-api + - libcublas-dev + {% endif %} + - glog {{ glog_version }} + - nlohmann_json {{ nlohmann_json_version }} + - h5py {{ h5py_version }} + - benchmark + - matplotlib + - python + - pandas + - pyyaml + # rmm is needed to determine if package is gpu-enabled + - rmm ={{ minor_version }} + + run: + - python + - libraft {{ version }} + - libcuvs {{ version }} + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} + {% if cuda_major == "11" %} + - cudatoolkit + {% endif %} + - glog {{ glog_version }} + - h5py {{ h5py_version }} + - benchmark + - glog {{ glog_version }} + - matplotlib + - python + - 
pandas + - pyyaml + # rmm is needed to determine if package is gpu-enabled + - rmm ={{ minor_version }} +about: + home: https://rapids.ai/ + license: Apache-2.0 + summary: CUDA ANN GPU and CPU benchmarks diff --git a/conda/recipes/cuvs/build.sh b/conda/recipes/cuvs/build.sh new file mode 100644 index 000000000..26933c70c --- /dev/null +++ b/conda/recipes/cuvs/build.sh @@ -0,0 +1,5 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +#!/usr/bin/env bash + +# This assumes the script is executed from the root of the repo directory +./build.sh cuvs --no-nvtx diff --git a/conda/recipes/cuvs/conda_build_config.yaml b/conda/recipes/cuvs/conda_build_config.yaml new file mode 100644 index 000000000..e28b98da7 --- /dev/null +++ b/conda/recipes/cuvs/conda_build_config.yaml @@ -0,0 +1,17 @@ +c_compiler_version: + - 11 + +cxx_compiler_version: + - 11 + +cuda_compiler: + - cuda-nvcc + +cuda11_compiler: + - nvcc + +sysroot_version: + - "2.17" + +cmake_version: + - ">=3.26.4" diff --git a/conda/recipes/cuvs/meta.yaml b/conda/recipes/cuvs/meta.yaml new file mode 100644 index 000000000..f22bd01d5 --- /dev/null +++ b/conda/recipes/cuvs/meta.yaml @@ -0,0 +1,76 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. + +# Usage: +# conda build . -c conda-forge -c numba -c rapidsai -c pytorch +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} +{% set py_version = environ['CONDA_PY'] %} +{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} +{% set cuda_major = cuda_version.split('.')[0] %} +{% set date_string = environ['RAPIDS_DATE_STRING'] %} + +package: + name: cuvs + version: {{ version }} + +source: + path: ../../.. 
+
+build:
+  number: {{ GIT_DESCRIBE_NUMBER }}
+  string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
+  ignore_run_exports_from:
+    {% if cuda_major == "11" %}
+    - {{ compiler('cuda11') }}
+    {% endif %}
+
+requirements:
+  build:
+    - {{ compiler('c') }}
+    - {{ compiler('cxx') }}
+    {% if cuda_major == "11" %}
+    - {{ compiler('cuda11') }} ={{ cuda_version }}
+    {% else %}
+    - {{ compiler('cuda') }}
+    {% endif %}
+    - cuda-version ={{ cuda_version }}
+    - cmake {{ cmake_version }}
+    - ninja
+    - sysroot_{{ target_platform }} {{ sysroot_version }}
+  host:
+    {% if cuda_major == "11" %}
+    - cuda-python >=11.7.1,<12.0a0
+    - cudatoolkit
+    {% else %}
+    - cuda-python >=12.0,<13.0a0
+    {% endif %}
+    - cuda-version ={{ cuda_version }}
+    - cython >=3.0.0
+    - pylibraft {{ version }}
+    - libcuvs {{ version }}
+    - numpy >=1.21
+    - python x.x
+    - rmm ={{ minor_version }}
+    - scikit-build >=0.13.1
+    - setuptools
+  run:
+    - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
+    {% if cuda_major == "11" %}
+    - cudatoolkit
+    {% endif %}
+    - pylibraft {{ version }}
+    - libcuvs {{ version }}
+    - python x.x
+    - rmm ={{ minor_version }}
+
+test:
+  requires:
+    - cuda-version ={{ cuda_version }}
+  imports:
+    - cuvs
+
+about:
+  home: https://rapids.ai/
+  license: Apache-2.0
+  # license_file: LICENSE
+  summary: cuvs python library
diff --git a/conda/recipes/libcuvs/build_libcuvs.sh b/conda/recipes/libcuvs/build_libcuvs.sh
new file mode 100644
index 000000000..ea0f64764
--- /dev/null
+++ b/conda/recipes/libcuvs/build_libcuvs.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ +./build.sh libcuvs --allgpuarch --compile-lib --build-metrics=compile_lib --incl-cache-stats --no-nvtx diff --git a/conda/recipes/libcuvs/build_libcuvs_static.sh b/conda/recipes/libcuvs/build_libcuvs_static.sh new file mode 100644 index 000000000..a107a9d6d --- /dev/null +++ b/conda/recipes/libcuvs/build_libcuvs_static.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# Copyright (c) 2022-2023, NVIDIA CORPORATION. + +./build.sh libcuvs --allgpuarch --compile-static-lib --build-metrics=compile_lib_static --incl-cache-stats --no-nvtx -n +cmake --install cpp/build --component compiled-static diff --git a/conda/recipes/libcuvs/build_libcuvs_template.sh b/conda/recipes/libcuvs/build_libcuvs_template.sh new file mode 100644 index 000000000..bd7719af7 --- /dev/null +++ b/conda/recipes/libcuvs/build_libcuvs_template.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# Copyright (c) 2022-2023, NVIDIA CORPORATION. + +# Just building template so we verify it uses libraft.so and fail if it doesn't build +./build.sh template diff --git a/conda/recipes/libcuvs/build_libraft_tests.sh b/conda/recipes/libcuvs/build_libraft_tests.sh new file mode 100644 index 000000000..f1f6567fb --- /dev/null +++ b/conda/recipes/libcuvs/build_libraft_tests.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ +./build.sh tests bench --allgpuarch --no-nvtx --build-metrics=tests_bench --incl-cache-stats +cmake --install cpp/build --component testing diff --git a/conda/recipes/libcuvs/conda_build_config.yaml b/conda/recipes/libcuvs/conda_build_config.yaml new file mode 100644 index 000000000..d156f2609 --- /dev/null +++ b/conda/recipes/libcuvs/conda_build_config.yaml @@ -0,0 +1,73 @@ +c_compiler_version: + - 11 + +cxx_compiler_version: + - 11 + +cuda_compiler: + - cuda-nvcc + +cuda11_compiler: + - nvcc + +sysroot_version: + - "2.17" + +cmake_version: + - ">=3.26.4" + +nccl_version: + - ">=2.9.9" + +gtest_version: + - ">=1.13.0" + +glog_version: + - ">=0.6.0" + +faiss_version: + - ">=1.7.1" + +h5py_version: + - ">=3.8.0" + +nlohmann_json_version: + - ">=3.11.2" + +# The CTK libraries below are missing from the conda-forge::cudatoolkit package +# for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages +# and the "*_run_*" version specifiers correspond to `11.x` packages. + +cuda11_libcublas_host_version: + - "=11.11.3.6" + +cuda11_libcublas_run_version: + - ">=11.5.2.43,<12.0.0" + +cuda11_libcurand_host_version: + - "=10.3.0.86" + +cuda11_libcurand_run_version: + - ">=10.2.5.43,<10.3.1" + +cuda11_libcusolver_host_version: + - "=11.4.1.48" + +cuda11_libcusolver_run_version: + - ">=11.2.0.43,<11.4.2" + +cuda11_libcusparse_host_version: + - "=11.7.5.86" + +cuda11_libcusparse_run_version: + - ">=11.6.0.43,<12.0.0" + +# `cuda-profiler-api` only has `11.8.0` and `12.0.0` packages for all +# architectures. The "*_host_*" version specifiers correspond to `11.8` packages and the +# "*_run_*" version specifiers correspond to `11.x` packages. 
+ +cuda11_cuda_profiler_api_host_version: + - "=11.8.86" + +cuda11_cuda_profiler_api_run_version: + - ">=11.4.240,<12" diff --git a/conda/recipes/libcuvs/meta.yaml b/conda/recipes/libcuvs/meta.yaml new file mode 100644 index 000000000..1ff2e190f --- /dev/null +++ b/conda/recipes/libcuvs/meta.yaml @@ -0,0 +1,173 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. + +# Usage: +# conda build . -c conda-forge -c nvidia -c rapidsai +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} +{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} +{% set cuda_major = cuda_version.split('.')[0] %} +{% set cuda_spec = ">=" + cuda_major ~ ",<" + (cuda_major | int + 1) ~ ".0a0" %} # i.e. >=11,<12.0a0 +{% set date_string = environ['RAPIDS_DATE_STRING'] %} + +package: + name: libcuvs-split + +source: + path: ../../.. + +outputs: + - name: libcuvs-static + version: {{ version }} + script: build_libcuvs_static.sh + build: + script_env: *script_env + number: {{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + ignore_run_exports_from: + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} + requirements: + build: + - {{ compiler('c') }} + - {{ compiler('cxx') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} + - cmake {{ cmake_version }} + - ninja + - sysroot_{{ target_platform }} {{ sysroot_version }} + host: + - {{ pin_subpackage('libraft-headers', exact=True) }} + - cuda-version ={{ cuda_version }} + {% if cuda_major == "11" %} + - cuda-profiler-api {{ cuda11_cuda_profiler_api_host_version }} + - libcublas {{ cuda11_libcublas_host_version }} + - libcublas-dev {{ cuda11_libcublas_host_version }} + - libcurand {{ 
cuda11_libcurand_host_version }} + - libcurand-dev {{ cuda11_libcurand_host_version }} + - libcusolver {{ cuda11_libcusolver_host_version }} + - libcusolver-dev {{ cuda11_libcusolver_host_version }} + - libcusparse {{ cuda11_libcusparse_host_version }} + - libcusparse-dev {{ cuda11_libcusparse_host_version }} + {% else %} + - cuda-profiler-api + - libcublas-dev + - libcurand-dev + - libcusolver-dev + - libcusparse-dev + {% endif %} + run: + - {{ pin_subpackage('libraft-headers', exact=True) }} + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} + about: + home: https://rapids.ai/ + license: Apache-2.0 + summary: libcuvs static library + - name: libcuvs-tests + version: {{ version }} + script: build_libcuvs_tests.sh + build: + script_env: *script_env + number: {{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + ignore_run_exports_from: + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} + requirements: + build: + - {{ compiler('c') }} + - {{ compiler('cxx') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} + - cmake {{ cmake_version }} + - ninja + - sysroot_{{ target_platform }} {{ sysroot_version }} + host: + - {{ pin_subpackage('libraft-headers', exact=True) }} + - cuda-version ={{ cuda_version }} + {% if cuda_major == "11" %} + - cuda-profiler-api {{ cuda11_cuda_profiler_api_run_version }} + - libcublas {{ cuda11_libcublas_host_version }} + - libcublas-dev {{ cuda11_libcublas_host_version }} + - libcurand {{ cuda11_libcurand_host_version }} + - libcurand-dev {{ cuda11_libcurand_host_version }} + - libcusolver {{ cuda11_libcusolver_host_version }} + - libcusolver-dev {{ cuda11_libcusolver_host_version }} + - libcusparse {{ cuda11_libcusparse_host_version }} + - libcusparse-dev {{ cuda11_libcusparse_host_version }} + {% else %} + - 
cuda-cudart-dev + - cuda-profiler-api + - libcublas-dev + - libcurand-dev + - libcusolver-dev + - libcusparse-dev + {% endif %} + - gmock {{ gtest_version }} + - gtest {{ gtest_version }} + run: + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} + {% if cuda_major == "11" %} + - cudatoolkit + {% endif %} + - {{ pin_subpackage('libraft-headers', exact=True) }} + - gmock {{ gtest_version }} + - gtest {{ gtest_version }} + about: + home: https://rapids.ai/ + license: Apache-2.0 + summary: libcuvs tests + - name: libcuvs-template + version: {{ version }} + script: build_libcuvs_template.sh + build: + script_env: *script_env + number: {{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + ignore_run_exports_from: + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} + requirements: + build: + - {{ compiler('c') }} + - {{ compiler('cxx') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} + - cmake {{ cmake_version }} + - ninja + - sysroot_{{ target_platform }} {{ sysroot_version }} + host: + - {{ pin_subpackage('libraft-headers', exact=True) }} + - cuda-version ={{ cuda_version }} + {% if cuda_major == "11" %} + - cuda-profiler-api {{ cuda11_cuda_profiler_api_run_version }} + - libcublas {{ cuda11_libcublas_host_version }} + - libcublas-dev {{ cuda11_libcublas_host_version }} + {% else %} + - cuda-profiler-api + - libcublas-dev + {% endif %} + run: + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} + {% if cuda_major == "11" %} + - cudatoolkit + {% endif %} + - {{ pin_subpackage('libraft-headers', exact=True) }} + about: + home: https://rapids.ai/ + license: Apache-2.0 + summary: libcuvs template diff --git a/cpp/.clang-format b/cpp/.clang-format new file mode 100644 index 000000000..18f376d66 --- /dev/null +++ b/cpp/.clang-format 
@@ -0,0 +1,155 @@ +--- +# Refer to the following link for the explanation of each params: +# http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -1 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: true +AlignConsecutiveBitFields: true +AlignConsecutiveDeclarations: false +AlignConsecutiveMacros: true +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: true +AllowShortEnumsOnASingleLine: true +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: true +AllowShortLambdasOnASingleLine: true +AllowShortLoopsOnASingleLine: false +# This is deprecated +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + # disabling the below splits, else, they'll just add to the vertical length of source files! 
+ SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +BreakAfterJavaFieldAnnotations: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: WebKit +BreakBeforeInheritanceComma: false +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakInheritanceList: BeforeColon +BreakStringLiterals: true +ColumnLimit: 100 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +# Kept the below 2 to be the same as `IndentWidth` to keep everything uniform +ConstructorInitializerIndentWidth: 2 +ContinuationIndentWidth: 2 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Preserve +IncludeIsMainRegex: '([-_](test|unittest))?$' +IndentCaseLabels: true +IndentPPDirectives: None +IndentWidth: 2 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Never +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' + - Language: TextProto + Delimiters: + - pb + - PB + - proto + - PROTO + EnclosingFunctions: + - EqualsProto + - EquivToProto + - PARSE_PARTIAL_TEXT_PROTO + - PARSE_TEST_PROTO + - PARSE_TEXT_PROTO + - 
ParseTextOrDie + - ParseTextProtoOrDie + CanonicalDelimiter: '' + BasedOnStyle: google +# Enabling comment reflow causes doxygen comments to be messed up in their formats! +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInConditionalStatement: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: c++17 +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +# Be consistent with indent-width, even for people who use tab for indentation! +TabWidth: 2 +UseTab: Never diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy new file mode 100644 index 000000000..b4a7d35d9 --- /dev/null +++ b/cpp/.clang-tidy @@ -0,0 +1,229 @@ +--- +# Refer to the following link for the explanation of each params: +# https://releases.llvm.org/8.0.1/tools/clang/tools/extra/docs/clang-tidy/checks/list.html +Checks: 'clang-diagnostic-*,clang-analyzer-*,modernize-*,-modernize-make-*,-modernize-raw-string-literal,google-*,-google-default-arguments,-clang-diagnostic-#pragma-messages,readability-identifier-naming,-*,modernize-*,-modernize-make-*,-modernize-raw-string-literal,google-*,-google-default-arguments,-clang-diagnostic-#pragma-messages,readability-identifier-naming' +WarningsAsErrors: '' +HeaderFilterRegex: '' +AnalyzeTemporaryDtors: false +FormatStyle: none +User: snanditale +CheckOptions: + - key: google-build-namespaces.HeaderFileExtensions + value: ',h,hh,hpp,hxx' + - key: google-global-names-in-headers.HeaderFileExtensions + value: 
',h,hh,hpp,hxx' + - key: google-readability-braces-around-statements.ShortStatementLines + value: '1' + - key: google-readability-function-size.BranchThreshold + value: '4294967295' + - key: google-readability-function-size.LineThreshold + value: '4294967295' + - key: google-readability-function-size.NestingThreshold + value: '4294967295' + - key: google-readability-function-size.ParameterThreshold + value: '4294967295' + - key: google-readability-function-size.StatementThreshold + value: '800' + - key: google-readability-function-size.VariableThreshold + value: '4294967295' + - key: google-readability-namespace-comments.ShortNamespaceLines + value: '10' + - key: google-readability-namespace-comments.SpacesBeforeComments + value: '2' + - key: google-runtime-int.SignedTypePrefix + value: int + - key: google-runtime-int.TypeSuffix + value: '' + - key: google-runtime-int.UnsignedTypePrefix + value: uint + - key: google-runtime-references.WhiteListTypes + value: '' + - key: modernize-loop-convert.MaxCopySize + value: '16' + - key: modernize-loop-convert.MinConfidence + value: reasonable + - key: modernize-loop-convert.NamingStyle + value: CamelCase + - key: modernize-pass-by-value.IncludeStyle + value: llvm + - key: modernize-pass-by-value.ValuesOnly + value: '0' + - key: modernize-replace-auto-ptr.IncludeStyle + value: llvm + - key: modernize-replace-random-shuffle.IncludeStyle + value: llvm + - key: modernize-use-auto.MinTypeNameLength + value: '5' + - key: modernize-use-auto.RemoveStars + value: '0' + - key: modernize-use-default-member-init.IgnoreMacros + value: '1' + - key: modernize-use-default-member-init.UseAssignment + value: '0' + - key: modernize-use-emplace.ContainersWithPushBack + value: '::std::vector;::std::list;::std::deque' + - key: modernize-use-emplace.SmartPointers + value: '::std::shared_ptr;::std::unique_ptr;::std::auto_ptr;::std::weak_ptr' + - key: modernize-use-emplace.TupleMakeFunctions + value: '::std::make_pair;::std::make_tuple' + - key: 
modernize-use-emplace.TupleTypes + value: '::std::pair;::std::tuple' + - key: modernize-use-equals-default.IgnoreMacros + value: '1' + - key: modernize-use-noexcept.ReplacementString + value: '' + - key: modernize-use-noexcept.UseNoexceptFalse + value: '1' + - key: modernize-use-nullptr.NullMacros + value: 'NULL' + - key: modernize-use-transparent-functors.SafeMode + value: '0' + - key: modernize-use-using.IgnoreMacros + value: '1' + - key: readability-identifier-naming.AbstractClassCase + value: lower_case + - key: readability-identifier-naming.AbstractClassPrefix + value: '' + - key: readability-identifier-naming.AbstractClassSuffix + value: '' + - key: readability-identifier-naming.ClassCase + value: lower_case + - key: readability-identifier-naming.ClassPrefix + value: '' + - key: readability-identifier-naming.ClassSuffix + value: '' + - key: readability-identifier-naming.ClassConstantCase + value: CamelCase + - key: readability-identifier-naming.ClassConstantPrefix + value: 'k' + - key: readability-identifier-naming.ClassConstantSuffix + value: '' + - key: readability-identifier-naming.ClassMemberCase + value: lower_case + - key: readability-identifier-naming.ClassMemberPrefix + value: '' + - key: readability-identifier-naming.ClassMemberSuffix + value: '_' + - key: readability-identifier-naming.ClassMethodCase + value: lower_case + - key: readability-identifier-naming.ClassMethodPrefix + value: '' + - key: readability-identifier-naming.ClassMethodSuffix + value: '' + - key: readability-identifier-naming.ConstexprFunctionCase + value: lower_case + - key: readability-identifier-naming.ConstexprFunctionPrefix + value: '' + - key: readability-identifier-naming.ConstexprFunctionSuffix + value: '' + - key: readability-identifier-naming.ConstexprMethodCase + value: lower_case + - key: readability-identifier-naming.ConstexprMethodPrefix + value: '' + - key: readability-identifier-naming.ConstexprMethodSuffix + value: '' + - key: 
readability-identifier-naming.ConstexprVariableCase + value: CamelCase + - key: readability-identifier-naming.ConstexprVariablePrefix + value: 'k' + - key: readability-identifier-naming.ConstexprVariableSuffix + value: '' + - key: readability-identifier-naming.EnumCase + value: CamelCase + - key: readability-identifier-naming.EnumPrefix + value: '' + - key: readability-identifier-naming.EnumSuffix + value: '' + - key: readability-identifier-naming.EnumConstantCase + value: CamelCase + - key: readability-identifier-naming.EnumConstantPrefix + value: 'k' + - key: readability-identifier-naming.EnumConstantSuffix + value: '' + - key: readability-identifier-naming.FunctionCase + value: lower_case + - key: readability-identifier-naming.FunctionPrefix + value: '' + - key: readability-identifier-naming.FunctionSuffix + value: '' + - key: readability-identifier-naming.GlobalConstantCase + value: CamelCase + - key: readability-identifier-naming.GlobalConstantPrefix + value: 'k' + - key: readability-identifier-naming.GlobalConstantSuffix + value: '' + - key: readability-identifier-naming.IgnoreFailedSplit + value: '0' + - key: readability-identifier-naming.LocalVariableCase + value: 'lower_case' + - key: readability-identifier-naming.LocalVariablePrefix + value: '' + - key: readability-identifier-naming.LocalVariableSuffix + value: '' + - key: readability-identifier-naming.ConstExprVariableCase + value: 'CamelCase' + - key: readability-identifier-naming.ConstExprVariablePrefix + value: 'k' + - key: readability-identifier-naming.ConstExprVariableSuffix + value: '' + - key: readability-identifier-naming.MemberCase + value: lower_case + - key: readability-identifier-naming.MemberPrefix + value: '' + - key: readability-identifier-naming.MemberSuffix + value: '' + - key: readability-identifier-naming.NamespaceCase + value: lower_case + - key: readability-identifier-naming.NamespacePrefix + value: '' + - key: readability-identifier-naming.NamespaceSuffix + value: '' + - key: 
readability-identifier-naming.PrivateMemberCase + value: lower_case + - key: readability-identifier-naming.PrivateMemberPrefix + value: '' + - key: readability-identifier-naming.PrivateMemberSuffix + value: '_' + - key: readability-identifier-naming.ProtectedMemberCase + value: lower_case + - key: readability-identifier-naming.ProtectedMemberPrefix + value: '' + - key: readability-identifier-naming.ProtectedMemberSuffix + value: '_' + - key: readability-identifier-naming.StaticConstantCase + value: CamelCase + - key: readability-identifier-naming.StaticConstantPrefix + value: 'k' + - key: readability-identifier-naming.StaticConstantSuffix + value: '' + - key: readability-identifier-naming.StructCase + value: lower_case + - key: readability-identifier-naming.StructPrefix + value: '' + - key: readability-identifier-naming.StructSuffix + value: '' + - key: readability-identifier-naming.TypeAliasCase + value: lower_case + - key: readability-identifier-naming.TypeAliasPrefix + value: '' + - key: readability-identifier-naming.TypeAliasSuffix + value: '' + - key: readability-identifier-naming.TypeTemplateParameterCase + value: CamelCase + - key: readability-identifier-naming.TypeTemplateParameterPrefix + value: '' + - key: readability-identifier-naming.TypeTemplateParameterSuffix + value: '' + - key: readability-identifier-naming.TypedefCase + value: lower_case + - key: readability-identifier-naming.TypedefPrefix + value: '' + - key: readability-identifier-naming.TypedefSuffix + value: '' + - key: readability-identifier-naming.VariableCase + value: lower_case + - key: readability-identifier-naming.VariablePrefix + value: '' + - key: readability-identifier-naming.VariableSuffix + value: '' +... 
diff --git a/cpp/.clangd b/cpp/.clangd new file mode 100644 index 000000000..7c4fe036d --- /dev/null +++ b/cpp/.clangd @@ -0,0 +1,65 @@ +# https://clangd.llvm.org/config + +# Apply a config conditionally to all C files +If: + PathMatch: .*\.(c|h)$ + +--- + +# Apply a config conditionally to all C++ files +If: + PathMatch: .*\.(c|h)pp + +--- + +# Apply a config conditionally to all CUDA files +If: + PathMatch: .*\.cuh? +CompileFlags: + Add: + - "-x" + - "cuda" + # No error on unknown CUDA versions + - "-Wno-unknown-cuda-version" + # Allow variadic CUDA functions + - "-Xclang=-fcuda-allow-variadic-functions" +Diagnostics: + Suppress: + - "variadic_device_fn" + - "attributes_not_allowed" + +--- + +# Tweak the clangd parse settings for all files +CompileFlags: + Add: + # report all errors + - "-ferror-limit=0" + - "-fmacro-backtrace-limit=0" + - "-ftemplate-backtrace-limit=0" + # Skip the CUDA version check + - "--no-cuda-version-check" + Remove: + # remove gcc's -fcoroutines + - -fcoroutines + # remove nvc++ flags unknown to clang + - "-gpu=*" + - "-stdpar*" + # remove nvcc flags unknown to clang + - "-arch*" + - "-gencode*" + - "--generate-code*" + - "-ccbin*" + - "-t=*" + - "--threads*" + - "-Xptxas*" + - "-Xcudafe*" + - "-Xfatbin*" + - "-Xcompiler*" + - "--diag-suppress*" + - "--diag_suppress*" + - "--compiler-options*" + - "--expt-extended-lambda" + - "--expt-relaxed-constexpr" + - "-forward-unknown-to-host-compiler" + - "-Werror=cross-execution-space-call" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt new file mode 100644 index 000000000..acb77ec8c --- /dev/null +++ b/cpp/CMakeLists.txt @@ -0,0 +1,741 @@ +# ============================================================================= +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +set(RAPIDS_VERSION "24.02") +set(RAFT_VERSION "24.02.00") + +cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) +include(../fetch_rapids.cmake) +include(rapids-cmake) +include(rapids-cpm) +include(rapids-export) +include(rapids-find) + +option(BUILD_CPU_ONLY "Build CPU only components. Applies to RAFT ANN benchmarks currently" OFF) + +# workaround for rapids_cuda_init_architectures not working for arch detection with +# enable_language(CUDA) +set(lang_list "CXX") + +if(NOT BUILD_CPU_ONLY) + include(rapids-cuda) + rapids_cuda_init_architectures(RAFT) + list(APPEND lang_list "CUDA") +endif() + +project( + RAFT + VERSION ${RAFT_VERSION} + LANGUAGES ${lang_list} +) + +# Write the version header +rapids_cmake_write_version_file(include/raft/version_config.hpp) + +# ################################################################################################## +# * build type --------------------------------------------------------------- + +# Set a default build type if none was specified +rapids_cmake_build_type(Release) + +# this is needed for clang-tidy runs +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# ################################################################################################## +# * User Options ------------------------------------------------------------ + +option(BUILD_SHARED_LIBS "Build raft shared libraries" ON) +option(BUILD_TESTS "Build raft unit-tests" ON) +option(BUILD_PRIMS_BENCH "Build raft C++ benchmark tests" OFF) +option(BUILD_ANN_BENCH "Build raft ann benchmarks" OFF) +option(CUDA_ENABLE_KERNELINFO "Enable kernel resource 
usage info" OFF)
+option(CUDA_ENABLE_LINEINFO
+       "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF
+)
+option(CUDA_STATIC_RUNTIME "Statically link the CUDA toolkit runtime and libraries" OFF)
+option(CUDA_LOG_COMPILE_TIME "Write a log of compilation times to nvcc_compile_log.csv" OFF)
+option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON)
+option(DISABLE_DEPRECATION_WARNINGS "Disable deprecation warnings " ON)
+option(DISABLE_OPENMP "Disable OpenMP" OFF)
+option(RAFT_NVTX "Enable nvtx markers" OFF)
+
+set(RAFT_COMPILE_LIBRARY_DEFAULT OFF)
+if((BUILD_TESTS
+    OR BUILD_PRIMS_BENCH
+    OR BUILD_ANN_BENCH
+   )
+   AND NOT BUILD_CPU_ONLY
+)
+  set(RAFT_COMPILE_LIBRARY_DEFAULT ON)
+endif()
+option(RAFT_COMPILE_LIBRARY "Enable building raft shared library instantiations"
+       ${RAFT_COMPILE_LIBRARY_DEFAULT}
+)
+
+if(BUILD_CPU_ONLY)
+  set(BUILD_SHARED_LIBS OFF)
+  set(BUILD_TESTS OFF)
+endif()
+
+# Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to
+# have different values for the `Threads::Threads` target.
Setting this flag ensures
+# `Threads::Threads` is the same value across all builds so that cache hits occur
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+
+include(CMakeDependentOption)
+# cmake_dependent_option( RAFT_USE_FAISS_STATIC "Build and statically link the FAISS library for
+# nearest neighbors search on GPU" ON RAFT_COMPILE_LIBRARY OFF )
+
+message(VERBOSE "RAFT: Building optional components: ${raft_FIND_COMPONENTS}")
+message(VERBOSE "RAFT: Build RAFT unit-tests: ${BUILD_TESTS}")
+message(VERBOSE "RAFT: Building raft C++ benchmarks: ${BUILD_PRIMS_BENCH}")
+message(VERBOSE "RAFT: Building ANN benchmarks: ${BUILD_ANN_BENCH}")
+message(VERBOSE "RAFT: Build CPU only components: ${BUILD_CPU_ONLY}")
+message(VERBOSE "RAFT: Enable detection of conda environment for dependencies: ${DETECT_CONDA_ENV}")
+message(VERBOSE "RAFT: Disable deprecation warnings " ${DISABLE_DEPRECATION_WARNINGS})
+message(VERBOSE "RAFT: Disable OpenMP: ${DISABLE_OPENMP}")
+message(VERBOSE "RAFT: Enable kernel resource usage info: ${CUDA_ENABLE_KERNELINFO}")
+message(VERBOSE "RAFT: Enable lineinfo in nvcc: ${CUDA_ENABLE_LINEINFO}")
+message(VERBOSE "RAFT: Enable nvtx markers: ${RAFT_NVTX}")
+message(VERBOSE
+        "RAFT: Statically link the CUDA toolkit runtime and libraries: ${CUDA_STATIC_RUNTIME}"
+)
+
+# Set RMM logging level
+set(RMM_LOGGING_LEVEL
+    "INFO"
+    CACHE STRING "Choose the logging level."
+) +set_property( + CACHE RMM_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" "OFF" +) +message(VERBOSE "RAFT: RMM_LOGGING_LEVEL = '${RMM_LOGGING_LEVEL}'.") + +# ################################################################################################## +# * Conda environment detection ---------------------------------------------- + +if(DETECT_CONDA_ENV) + rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) + if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND DEFINED ENV{CONDA_PREFIX}) + message( + STATUS "RAFT: No CMAKE_INSTALL_PREFIX argument detected, setting to: $ENV{CONDA_PREFIX}" + ) + set(CMAKE_INSTALL_PREFIX "$ENV{CONDA_PREFIX}") + endif() +endif() + +# ################################################################################################## +# * compiler options ---------------------------------------------------------- + +set(_ctk_static_suffix "") +if(CUDA_STATIC_RUNTIME) + set(_ctk_static_suffix "_static") +endif() + +if(NOT BUILD_CPU_ONLY) + # CUDA runtime + rapids_cuda_init_runtime(USE_STATIC ${CUDA_STATIC_RUNTIME}) + # * find CUDAToolkit package + # * determine GPU architectures + # * enable the CMake CUDA language + # * set other CUDA compilation flags + rapids_find_package( + CUDAToolkit REQUIRED + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports + ) +else() + add_compile_definitions(BUILD_CPU_ONLY) +endif() + +if(NOT DISABLE_OPENMP) + rapids_find_package( + OpenMP REQUIRED + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports + ) + if(OPENMP_FOUND) + message(VERBOSE "RAFT: OpenMP found in ${OpenMP_CXX_INCLUDE_DIRS}") + endif() +endif() + +include(cmake/modules/ConfigureCUDA.cmake) + +# ################################################################################################## +# * Requirements ------------------------------------------------------------- + +# add third party dependencies using CPM +rapids_cpm_init() + +if(NOT BUILD_CPU_ONLY) + # thrust 
before rmm/cuco so we get the right version of thrust/cub + include(cmake/thirdparty/get_thrust.cmake) + include(cmake/thirdparty/get_rmm.cmake) + include(cmake/thirdparty/get_cutlass.cmake) + + include(${rapids-cmake-dir}/cpm/cuco.cmake) + rapids_cpm_cuco(BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports) +endif() + +if(BUILD_TESTS) + include(cmake/thirdparty/get_gtest.cmake) +endif() + +if(BUILD_PRIMS_BENCH OR BUILD_ANN_BENCH) + include(${rapids-cmake-dir}/cpm/gbench.cmake) + rapids_cpm_gbench() +endif() + +# ################################################################################################## +# * raft --------------------------------------------------------------------- +add_library(raft INTERFACE) +add_library(raft::raft ALIAS raft) + +target_include_directories( + raft INTERFACE "$" "$" +) + +if(NOT BUILD_CPU_ONLY) + # Keep RAFT as lightweight as possible. Only CUDA libs and rmm should be used in global target. + target_link_libraries(raft INTERFACE rmm::rmm cuco::cuco nvidia::cutlass::cutlass raft::Thrust) +endif() + +target_compile_features(raft INTERFACE cxx_std_17 $) +target_compile_options( + raft INTERFACE $<$:--expt-extended-lambda + --expt-relaxed-constexpr> +) + +set(RAFT_CUSOLVER_DEPENDENCY CUDA::cusolver${_ctk_static_suffix}) +set(RAFT_CUBLAS_DEPENDENCY CUDA::cublas${_ctk_static_suffix}) +set(RAFT_CURAND_DEPENDENCY CUDA::curand${_ctk_static_suffix}) +set(RAFT_CUSPARSE_DEPENDENCY CUDA::cusparse${_ctk_static_suffix}) + +set(RAFT_CTK_MATH_DEPENDENCIES ${RAFT_CUBLAS_DEPENDENCY} ${RAFT_CUSOLVER_DEPENDENCY} + ${RAFT_CUSPARSE_DEPENDENCY} ${RAFT_CURAND_DEPENDENCY} +) + +# Endian detection +include(TestBigEndian) +test_big_endian(BIG_ENDIAN) +if(BIG_ENDIAN) + target_compile_definitions(raft INTERFACE RAFT_SYSTEM_LITTLE_ENDIAN=0) +else() + target_compile_definitions(raft INTERFACE RAFT_SYSTEM_LITTLE_ENDIAN=1) +endif() + +if(RAFT_COMPILE_LIBRARY) + file( + WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld" + [=[ +SECTIONS +{ 
+.nvFatBinSegment : { *(.nvFatBinSegment) } +.nv_fatbin : { *(.nv_fatbin) } +} +]=] + ) +endif() + +# ################################################################################################## +# * NVTX support in raft ----------------------------------------------------- + +if(RAFT_NVTX) + # This enables NVTX within the project with no option to disable it downstream. + target_link_libraries(raft INTERFACE CUDA::nvToolsExt) + target_compile_definitions(raft INTERFACE NVTX_ENABLED) +else() + # Allow enable NVTX downstream if not set here. This creates a new option at build/install time, + # which is set by default to OFF, but can be enabled in the dependent project. + get_property( + nvtx_option_help_string + CACHE RAFT_NVTX + PROPERTY HELPSTRING + ) + string( + CONCAT + nvtx_export_string + "option(RAFT_NVTX \"" + ${nvtx_option_help_string} + "\" OFF)" + [=[ + +target_link_libraries(raft::raft INTERFACE $<$:CUDA::nvToolsExt>) +target_compile_definitions(raft::raft INTERFACE $<$:NVTX_ENABLED>) + + ]=] + ) +endif() + +# ################################################################################################## +# * raft_compiled ------------------------------------------------------------ +add_library(raft_compiled INTERFACE) + +if(TARGET raft_compiled AND (NOT TARGET raft::compiled)) + add_library(raft::compiled ALIAS raft_compiled) +endif() + +set_target_properties(raft_compiled PROPERTIES EXPORT_NAME compiled) + +if(RAFT_COMPILE_LIBRARY) + add_library( + raft_objs OBJECT + src/core/logger.cpp + src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu + 
src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_kl_divergence_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_kl_divergence_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l1_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_l1_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_rbf.cu + src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu + src/distance/distance.cu + src/distance/fused_l2_nn.cu + src/linalg/detail/coalesced_reduction.cu + 
src/matrix/detail/select_k_double_int64_t.cu + src/matrix/detail/select_k_double_uint32_t.cu + src/matrix/detail/select_k_float_int64_t.cu + src/matrix/detail/select_k_float_uint32_t.cu + src/matrix/detail/select_k_float_int32.cu + src/matrix/detail/select_k_half_int64_t.cu + src/matrix/detail/select_k_half_uint32_t.cu + src/neighbors/ball_cover.cu + src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu + src/neighbors/brute_force_knn_int64_t_float_int64_t.cu + src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu + src/neighbors/brute_force_knn_int_float_int.cu + src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu + src/neighbors/brute_force_knn_index_float.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu + src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu + src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu + src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu + src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu + src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu + src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu + src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu + 
src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu + src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu + src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu + src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu + src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu + src/neighbors/detail/ivf_flat_search.cu + src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu + src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu + src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu + src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu + src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu + src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu + src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu + src/neighbors/detail/refine_host_float_float.cpp + src/neighbors/detail/refine_host_int8_t_float.cpp + src/neighbors/detail/refine_host_uint8_t_float.cpp + src/neighbors/detail/selection_faiss_int32_t_float.cu + src/neighbors/detail/selection_faiss_int_double.cu + src/neighbors/detail/selection_faiss_long_float.cu + src/neighbors/detail/selection_faiss_size_t_double.cu + src/neighbors/detail/selection_faiss_size_t_float.cu + src/neighbors/detail/selection_faiss_uint32_t_float.cu + src/neighbors/detail/selection_faiss_int64_t_double.cu + src/neighbors/detail/selection_faiss_int64_t_half.cu + src/neighbors/detail/selection_faiss_uint32_t_double.cu + src/neighbors/detail/selection_faiss_uint32_t_half.cu + src/neighbors/ivf_flat_build_float_int64_t.cu + src/neighbors/ivf_flat_build_int8_t_int64_t.cu + 
src/neighbors/ivf_flat_build_uint8_t_int64_t.cu + src/neighbors/ivf_flat_extend_float_int64_t.cu + src/neighbors/ivf_flat_extend_int8_t_int64_t.cu + src/neighbors/ivf_flat_extend_uint8_t_int64_t.cu + src/neighbors/ivf_flat_search_float_int64_t.cu + src/neighbors/ivf_flat_search_int8_t_int64_t.cu + src/neighbors/ivf_flat_search_uint8_t_int64_t.cu + src/neighbors/ivfpq_build_float_int64_t.cu + src/neighbors/ivfpq_build_int8_t_int64_t.cu + src/neighbors/ivfpq_build_uint8_t_int64_t.cu + src/neighbors/ivfpq_extend_float_int64_t.cu + src/neighbors/ivfpq_extend_int8_t_int64_t.cu + src/neighbors/ivfpq_extend_uint8_t_int64_t.cu + src/neighbors/ivfpq_search_float_int64_t.cu + src/neighbors/ivfpq_search_int8_t_int64_t.cu + src/neighbors/ivfpq_search_uint8_t_int64_t.cu + src/neighbors/refine_float_float.cu + src/neighbors/refine_int8_t_float.cu + src/neighbors/refine_uint8_t_float.cu + src/raft_runtime/cluster/cluster_cost.cuh + src/raft_runtime/cluster/cluster_cost_double.cu + src/raft_runtime/cluster/cluster_cost_float.cu + src/raft_runtime/cluster/kmeans_fit_double.cu + src/raft_runtime/cluster/kmeans_fit_float.cu + src/raft_runtime/cluster/kmeans_init_plus_plus_double.cu + src/raft_runtime/cluster/kmeans_init_plus_plus_float.cu + src/raft_runtime/cluster/update_centroids.cuh + src/raft_runtime/cluster/update_centroids_double.cu + src/raft_runtime/cluster/update_centroids_float.cu + src/raft_runtime/distance/fused_l2_min_arg.cu + src/raft_runtime/distance/pairwise_distance.cu + src/raft_runtime/matrix/select_k_float_int64_t.cu + src/raft_runtime/neighbors/brute_force_knn_int64_t_float.cu + src/raft_runtime/neighbors/cagra_build.cu + src/raft_runtime/neighbors/cagra_search.cu + src/raft_runtime/neighbors/cagra_serialize.cu + src/raft_runtime/neighbors/ivf_flat_build.cu + src/raft_runtime/neighbors/ivf_flat_search.cu + src/raft_runtime/neighbors/ivf_flat_serialize.cu + src/raft_runtime/neighbors/ivfpq_build.cu + src/raft_runtime/neighbors/ivfpq_deserialize.cu + 
src/raft_runtime/neighbors/ivfpq_search_float_int64_t.cu + src/raft_runtime/neighbors/ivfpq_search_int8_t_int64_t.cu + src/raft_runtime/neighbors/ivfpq_search_uint8_t_int64_t.cu + src/raft_runtime/neighbors/ivfpq_serialize.cu + src/raft_runtime/neighbors/refine_d_int64_t_float.cu + src/raft_runtime/neighbors/refine_d_int64_t_int8_t.cu + src/raft_runtime/neighbors/refine_d_int64_t_uint8_t.cu + src/raft_runtime/neighbors/refine_h_int64_t_float.cu + src/raft_runtime/neighbors/refine_h_int64_t_int8_t.cu + src/raft_runtime/neighbors/refine_h_int64_t_uint8_t.cu + src/raft_runtime/random/rmat_rectangular_generator_int64_double.cu + src/raft_runtime/random/rmat_rectangular_generator_int64_float.cu + src/raft_runtime/random/rmat_rectangular_generator_int_double.cu + src/raft_runtime/random/rmat_rectangular_generator_int_float.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_2d_dist.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_2d_euclidean.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_2d_haversine.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_3d_dist.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_3d_euclidean.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_3d_haversine.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_2d_dist.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_2d_euclidean.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_2d_haversine.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_3d_dist.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_3d_euclidean.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_3d_haversine.cu + src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu + src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu + src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu + src/util/memory_pool.cpp + ) + set_target_properties( + raft_objs + PROPERTIES CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED 
ON + POSITION_INDEPENDENT_CODE ON + ) + + target_compile_definitions(raft_objs PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY") + target_compile_options( + raft_objs PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) + + add_library(raft_lib SHARED $) + add_library(raft_lib_static STATIC $) + + set_target_properties( + raft_lib raft_lib_static + PROPERTIES OUTPUT_NAME raft + BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) + + foreach(target raft_lib raft_lib_static raft_objs) + target_link_libraries( + ${target} + PUBLIC raft::raft + ${RAFT_CTK_MATH_DEPENDENCIES} # TODO: Once `raft::resources` is used everywhere, this + # will just be cublas + $ + ) + + # So consumers know when using libraft.so/libraft.a + target_compile_definitions(${target} PUBLIC "RAFT_COMPILED") + # ensure CUDA symbols aren't relocated to the middle of the debug build binaries + target_link_options(${target} PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") + endforeach() +endif() + +if(TARGET raft_lib AND (NOT TARGET raft::raft_lib)) + add_library(raft::raft_lib ALIAS raft_lib) +endif() + +target_link_libraries(raft_compiled INTERFACE raft::raft $) + +# ################################################################################################## +# * raft_compiled_static---------------------------------------------------------------------------- + +add_library(raft_compiled_static INTERFACE) + +if(TARGET raft_compiled_static AND (NOT TARGET raft::compiled_static)) + add_library(raft::compiled_static ALIAS raft_compiled_static) +endif() +set_target_properties(raft_compiled_static PROPERTIES EXPORT_NAME compiled_static) + +if(TARGET raft_lib_static AND (NOT TARGET raft::raft_lib_static)) + add_library(raft::raft_lib_static ALIAS raft_lib_static) +endif() + +target_link_libraries( + raft_compiled_static INTERFACE raft::raft $ +) + +# ################################################################################################## +# * 
raft_distributed ------------------------------------------------------------------------------- +add_library(raft_distributed INTERFACE) + +if(TARGET raft_distributed AND (NOT TARGET raft::distributed)) + add_library(raft::distributed ALIAS raft_distributed) +endif() + +set_target_properties(raft_distributed PROPERTIES EXPORT_NAME distributed) + +rapids_find_generate_module( + NCCL + HEADER_NAMES nccl.h + LIBRARY_NAMES nccl + BUILD_EXPORT_SET raft-distributed-exports + INSTALL_EXPORT_SET raft-distributed-exports +) + +rapids_export_package(BUILD ucx raft-distributed-exports) +rapids_export_package(INSTALL ucx raft-distributed-exports) +rapids_export_package(BUILD NCCL raft-distributed-exports) +rapids_export_package(INSTALL NCCL raft-distributed-exports) + +target_link_libraries(raft_distributed INTERFACE ucx::ucp NCCL::NCCL) + +# ################################################################################################## +# * install targets----------------------------------------------------------- +rapids_cmake_install_lib_dir(lib_dir) +include(GNUInstallDirs) +include(CPack) + +install( + TARGETS raft + DESTINATION ${lib_dir} + COMPONENT raft + EXPORT raft-exports +) + +install( + TARGETS raft_compiled raft_compiled_static + DESTINATION ${lib_dir} + COMPONENT raft + EXPORT raft-compiled-exports +) + +if(TARGET raft_lib) + install( + TARGETS raft_lib + DESTINATION ${lib_dir} + COMPONENT compiled + EXPORT raft-compiled-lib-exports + ) + install( + TARGETS raft_lib_static + DESTINATION ${lib_dir} + COMPONENT compiled-static + EXPORT raft-compiled-static-lib-exports + ) + install( + DIRECTORY include/raft_runtime + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + COMPONENT compiled + ) +endif() + +install( + TARGETS raft_distributed + DESTINATION ${lib_dir} + COMPONENT distributed + EXPORT raft-distributed-exports +) + +install( + DIRECTORY include/raft + COMPONENT raft + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) + +# Temporary install of raft.hpp while the file 
is removed +install( + FILES include/raft.hpp + COMPONENT raft + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft +) + +install( + FILES ${CMAKE_CURRENT_BINARY_DIR}/include/raft/version_config.hpp + COMPONENT raft + DESTINATION include/raft +) + +# ################################################################################################## +# * install export ----------------------------------------------------------- +set(doc_string + [=[ +Provide targets for the RAFT: Reusable Accelerated Functions and Tools + +RAFT contains fundamental widely-used algorithms and primitives +for data science and machine learning. + +Optional Components: + - compiled + - compiled_static + - distributed + +Imported Targets: + - raft::raft + - raft::compiled brought in by the `compiled` optional component + - raft::compiled_static brought in by the `compiled_static` optional component + - raft::distributed brought in by the `distributed` optional component + +]=] +) + +set(code_string ${nvtx_export_string}) + +string( + APPEND + code_string + [=[ +if(NOT TARGET raft::Thrust) + thrust_create_target(raft::Thrust FROM_OPTIONS) +endif() +]=] +) + +string( + APPEND + code_string + [=[ +if(compiled IN_LIST raft_FIND_COMPONENTS) + enable_language(CUDA) +endif() +]=] +) +set(raft_components compiled distributed) +set(raft_export_sets raft-compiled-exports raft-distributed-exports) +if(TARGET raft_lib) + list(APPEND raft_components compiled compiled-static) + list(APPEND raft_export_sets raft-compiled-lib-exports raft-compiled-static-lib-exports) +endif() + +string( + APPEND + code_string + [=[ + option(RAFT_ENABLE_CUSOLVER_DEPENDENCY "Enable cusolver dependency" ON) + option(RAFT_ENABLE_CUBLAS_DEPENDENCY "Enable cublas dependency" ON) + option(RAFT_ENABLE_CURAND_DEPENDENCY "Enable curand dependency" ON) + option(RAFT_ENABLE_CUSPARSE_DEPENDENCY "Enable cusparse dependency" ON) + +mark_as_advanced(RAFT_ENABLE_CUSOLVER_DEPENDENCY) +mark_as_advanced(RAFT_ENABLE_CUBLAS_DEPENDENCY) 
+mark_as_advanced(RAFT_ENABLE_CURAND_DEPENDENCY) +mark_as_advanced(RAFT_ENABLE_CUSPARSE_DEPENDENCY) + +target_link_libraries(raft::raft INTERFACE + $<$:${RAFT_CUSOLVER_DEPENDENCY}> + $<$:${RAFT_CUBLAS_DEPENDENCY}> + $<$:${RAFT_CUSPARSE_DEPENDENCY}> + $<$:${RAFT_CURAND_DEPENDENCY}> +) +]=] +) + +# Use `rapids_export` for 22.04 as it will have COMPONENT support +rapids_export( + INSTALL raft + EXPORT_SET raft-exports + COMPONENTS ${raft_components} + COMPONENTS_EXPORT_SET ${raft_export_sets} + GLOBAL_TARGETS raft compiled distributed + NAMESPACE raft:: + DOCUMENTATION doc_string + FINAL_CODE_BLOCK code_string +) + +# ################################################################################################## +# * build export ------------------------------------------------------------- +rapids_export( + BUILD raft + EXPORT_SET raft-exports + COMPONENTS ${raft_components} + COMPONENTS_EXPORT_SET ${raft_export_sets} + GLOBAL_TARGETS raft compiled distributed + DOCUMENTATION doc_string + NAMESPACE raft:: + FINAL_CODE_BLOCK code_string +) + +# ################################################################################################## +# * shared test/bench headers ------------------------------------------------ + +if(BUILD_TESTS OR BUILD_PRIMS_BENCH) + include(internal/CMakeLists.txt) +endif() + +# ################################################################################################## +# * build test executable ---------------------------------------------------- + +if(BUILD_TESTS) + include(test/CMakeLists.txt) +endif() + +# ################################################################################################## +# * build benchmark executable ----------------------------------------------- + +if(BUILD_PRIMS_BENCH) + include(bench/prims/CMakeLists.txt) +endif() + +# ################################################################################################## +# * build ann benchmark executable 
----------------------------------------------- + +if(BUILD_ANN_BENCH) + include(bench/ann/CMakeLists.txt) +endif() diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt new file mode 100644 index 000000000..5919de07e --- /dev/null +++ b/cpp/bench/ann/CMakeLists.txt @@ -0,0 +1,379 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# ################################################################################################## +# * benchmark options ------------------------------------------------------------------------------ + +option(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT + "Include faiss' cpu brute-force knn algorithm in benchmark" ON +) + +option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT "Include faiss' cpu ivf flat algorithm in benchmark" + ON +) +option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm in benchmark" ON)
+option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_RAFT_CAGRA "Include raft's CAGRA in benchmark" ON) +option(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB "Include raft's CAGRA with HNSW search in benchmark" ON) +option(RAFT_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_SINGLE_EXE + "Make a single executable with benchmark as shared library modules" OFF +) + +# ################################################################################################## +# * Process options ---------------------------------------------------------- + +find_package(Threads REQUIRED) + +if(BUILD_CPU_ONLY) + + # Include necessary logging dependencies + include(cmake/thirdparty/get_fmt.cmake) + include(cmake/thirdparty/get_spdlog.cmake) + + set(RAFT_FAISS_ENABLE_GPU OFF) + set(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT OFF) + set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT OFF) + set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ OFF) + set(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT OFF) + set(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ OFF) + set(RAFT_ANN_BENCH_USE_RAFT_CAGRA OFF) + set(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB OFF) + set(RAFT_ANN_BENCH_USE_GGNN OFF) +else() + # Disable faiss benchmarks on CUDA 12 since faiss is not yet CUDA 12-enabled.
+ # https://github.com/rapidsai/raft/issues/1627 + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0.0) + set(RAFT_FAISS_ENABLE_GPU OFF) + set(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT OFF) + set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT OFF) + set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ OFF) + set(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT OFF) + set(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ OFF) + set(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT OFF) + else() + set(RAFT_FAISS_ENABLE_GPU ON) + endif() +endif() + +set(RAFT_ANN_BENCH_USE_FAISS OFF) +if(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT + OR RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ + OR RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT + OR RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT + OR RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ + OR RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT +) + set(RAFT_ANN_BENCH_USE_FAISS ON) + set(RAFT_USE_FAISS_STATIC ON) +endif() + +set(RAFT_ANN_BENCH_USE_RAFT OFF) +if(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ + OR RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE + OR RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT + OR RAFT_ANN_BENCH_USE_RAFT_CAGRA + OR RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB +) + set(RAFT_ANN_BENCH_USE_RAFT ON) +endif() + +# ################################################################################################## +# * Fetch requirements ------------------------------------------------------------- + +if(RAFT_ANN_BENCH_USE_HNSWLIB OR RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB) + include(cmake/thirdparty/get_hnswlib.cmake) +endif() + +include(cmake/thirdparty/get_nlohmann_json.cmake) + +if(RAFT_ANN_BENCH_USE_GGNN) + include(cmake/thirdparty/get_ggnn.cmake) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS) + # We need to ensure that faiss has all the conda information. 
So we currently use the very ugly + # hammer of `link_libraries` to ensure that all targets in this directory and the faiss directory + # will have the conda includes/link dirs + link_libraries($) + include(cmake/thirdparty/get_faiss.cmake) +endif() + +# ################################################################################################## +# * Configure tests function------------------------------------------------------------- + +function(ConfigureAnnBench) + + set(oneValueArgs NAME) + set(multiValueArgs PATH LINKS CXXFLAGS INCLUDES) + + if(NOT BUILD_CPU_ONLY) + set(GPU_BUILD ON) + endif() + + cmake_parse_arguments( + ConfigureAnnBench "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} + ) + + set(BENCH_NAME ${ConfigureAnnBench_NAME}_ANN_BENCH) + + if(RAFT_ANN_BENCH_SINGLE_EXE) + add_library(${BENCH_NAME} SHARED ${ConfigureAnnBench_PATH}) + string(TOLOWER ${BENCH_NAME} BENCH_LIB_NAME) + set_target_properties(${BENCH_NAME} PROPERTIES OUTPUT_NAME ${BENCH_LIB_NAME}) + add_dependencies(${BENCH_NAME} ANN_BENCH) + else() + add_executable(${BENCH_NAME} ${ConfigureAnnBench_PATH}) + target_compile_definitions(${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN) + target_link_libraries(${BENCH_NAME} PRIVATE benchmark::benchmark) + endif() + + target_link_libraries( + ${BENCH_NAME} + PRIVATE raft::raft + nlohmann_json::nlohmann_json + ${ConfigureAnnBench_LINKS} + Threads::Threads + $<$:${RAFT_CTK_MATH_DEPENDENCIES}> + $ + $ + -static-libgcc + -static-libstdc++ + $<$:fmt::fmt-header-only> + $<$:spdlog::spdlog_header_only> + ) + + set_target_properties( + ${BENCH_NAME} + PROPERTIES # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + ) + + set(${ConfigureAnnBench_CXXFLAGS} ${RAFT_CXX_FLAGS} ${ConfigureAnnBench_CXXFLAGS}) + + target_compile_options( + ${BENCH_NAME} PRIVATE 
"$<$:${ConfigureAnnBench_CXXFLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) + + if(RAFT_ANN_BENCH_USE_${ConfigureAnnBench_NAME}) + target_compile_definitions( + ${BENCH_NAME} + PUBLIC + RAFT_ANN_BENCH_USE_${ConfigureAnnBench_NAME}=RAFT_ANN_BENCH_USE_${ConfigureAnnBench_NAME} + ) + endif() + + target_include_directories( + ${BENCH_NAME} + PUBLIC "$" + PRIVATE ${ConfigureAnnBench_INCLUDES} + ) + + install( + TARGETS ${BENCH_NAME} + COMPONENT ann_bench + DESTINATION bin/ann + ) +endfunction() + +# ################################################################################################## +# * Configure tests------------------------------------------------------------- + +if(RAFT_ANN_BENCH_USE_HNSWLIB) + ConfigureAnnBench( + NAME HNSWLIB PATH bench/ann/src/hnswlib/hnswlib_benchmark.cpp INCLUDES + ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib CXXFLAGS "${HNSW_CXX_FLAGS}" + ) +endif() + +if(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ) + ConfigureAnnBench( + NAME + RAFT_IVF_PQ + PATH + bench/ann/src/raft/raft_benchmark.cu + $<$:bench/ann/src/raft/raft_ivf_pq.cu> + LINKS + raft::compiled + ) +endif() + +if(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT) + ConfigureAnnBench( + NAME + RAFT_IVF_FLAT + PATH + bench/ann/src/raft/raft_benchmark.cu + $<$:bench/ann/src/raft/raft_ivf_flat.cu> + LINKS + raft::compiled + ) +endif() + +if(RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE) + ConfigureAnnBench( + NAME RAFT_BRUTE_FORCE PATH bench/ann/src/raft/raft_benchmark.cu LINKS raft::compiled + ) +endif() + +if(RAFT_ANN_BENCH_USE_RAFT_CAGRA) + ConfigureAnnBench( + NAME + RAFT_CAGRA + PATH + bench/ann/src/raft/raft_benchmark.cu + $<$:bench/ann/src/raft/raft_cagra.cu> + LINKS + raft::compiled + ) +endif() + +if(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB) + ConfigureAnnBench( + NAME + RAFT_CAGRA_HNSWLIB + PATH + bench/ann/src/raft/raft_cagra_hnswlib.cu + INCLUDES + ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib + LINKS + raft::compiled + CXXFLAGS + "${HNSW_CXX_FLAGS}" + ) +endif() + +set(RAFT_FAISS_TARGETS 
faiss::faiss) +if(TARGET faiss::faiss_avx2) + set(RAFT_FAISS_TARGETS faiss::faiss_avx2) +endif() + +message("RAFT_FAISS_TARGETS: ${RAFT_FAISS_TARGETS}") +message("CUDAToolkit_LIBRARY_DIR: ${CUDAToolkit_LIBRARY_DIR}") +if(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT) + ConfigureAnnBench( + NAME FAISS_CPU_FLAT PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS + ${RAFT_FAISS_TARGETS} + ) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT) + ConfigureAnnBench( + NAME FAISS_CPU_IVF_FLAT PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS + ${RAFT_FAISS_TARGETS} + ) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ) + ConfigureAnnBench( + NAME FAISS_CPU_IVF_PQ PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS + ${RAFT_FAISS_TARGETS} + ) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT) + ConfigureAnnBench( + NAME FAISS_GPU_IVF_FLAT PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS + ${RAFT_FAISS_TARGETS} + ) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ) + ConfigureAnnBench( + NAME FAISS_GPU_IVF_PQ PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS + ${RAFT_FAISS_TARGETS} + ) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT) + ConfigureAnnBench( + NAME FAISS_GPU_FLAT PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS ${RAFT_FAISS_TARGETS} + ) +endif() + +if(RAFT_ANN_BENCH_USE_GGNN) + include(cmake/thirdparty/get_glog.cmake) + ConfigureAnnBench( + NAME GGNN PATH bench/ann/src/ggnn/ggnn_benchmark.cu INCLUDES + ${CMAKE_CURRENT_BINARY_DIR}/_deps/ggnn-src/include LINKS glog::glog + ) +endif() + +# ################################################################################################## +# * Dynamically-loading ANN_BENCH executable ------------------------------------------------------- +if(RAFT_ANN_BENCH_SINGLE_EXE) + add_executable(ANN_BENCH bench/ann/src/common/benchmark.cpp) + + # Build and link static version of the GBench to keep ANN_BENCH self-contained. 
+ get_target_property(TMP_PROP benchmark::benchmark SOURCES) + add_library(benchmark_static STATIC ${TMP_PROP}) + get_target_property(TMP_PROP benchmark::benchmark INCLUDE_DIRECTORIES) + target_include_directories(benchmark_static PUBLIC ${TMP_PROP}) + get_target_property(TMP_PROP benchmark::benchmark LINK_LIBRARIES) + target_link_libraries(benchmark_static PUBLIC ${TMP_PROP}) + + target_include_directories(ANN_BENCH PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + + target_link_libraries( + ANN_BENCH PRIVATE nlohmann_json::nlohmann_json benchmark_static dl -static-libgcc + -static-libstdc++ CUDA::nvtx3 + ) + set_target_properties( + ANN_BENCH + PROPERTIES # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + ) + + # Disable NVTX when the nvtx3 headers are missing + set(_CMAKE_REQUIRED_INCLUDES_ORIG ${CMAKE_REQUIRED_INCLUDES}) + get_target_property(CMAKE_REQUIRED_INCLUDES ANN_BENCH INCLUDE_DIRECTORIES) + CHECK_INCLUDE_FILE_CXX(nvtx3/nvToolsExt.h NVTX3_HEADERS_FOUND) + set(CMAKE_REQUIRED_INCLUDES ${_CMAKE_REQUIRED_INCLUDES_ORIG}) + target_compile_definitions( + ANN_BENCH + PRIVATE + $<$:ANN_BENCH_LINK_CUDART="libcudart.so.${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}.${CUDAToolkit_VERSION_PATCH} + "> + $<$:ANN_BENCH_NVTX3_HEADERS_FOUND> + ) + + target_link_options(ANN_BENCH PRIVATE -export-dynamic) + + install( + TARGETS ANN_BENCH + COMPONENT ann_bench + DESTINATION bin/ann + EXCLUDE_FROM_ALL + ) +endif() diff --git a/cpp/bench/ann/README.md b/cpp/bench/ann/README.md new file mode 100644 index 000000000..1a8af2e44 --- /dev/null +++ b/cpp/bench/ann/README.md @@ -0,0 +1,3 @@ +# RAFT CUDA ANN Benchmarks + +Please see the [ANN Benchmarks](https://docs.rapids.ai/api/raft/stable/cuda_ann_benchmarks.html) section of the RAFT documentation for instructions on building 
and using the ANN benchmarks. \ No newline at end of file diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp new file mode 100644 index 000000000..e964a81ef --- /dev/null +++ b/cpp/bench/ann/src/common/ann_types.hpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cuda_stub.hpp" // cudaStream_t + +#include +#include +#include + +namespace raft::bench::ann { + +enum Objective { + THROUGHPUT, // See how many vectors we can push through + LATENCY // See how fast we can push a vector through +}; + +enum class MemoryType { + Host, + HostMmap, + Device, +}; + +enum class Metric { + kInnerProduct, + kEuclidean, +}; + +inline auto parse_metric(const std::string& metric_str) -> Metric +{ + if (metric_str == "inner_product") { + return raft::bench::ann::Metric::kInnerProduct; + } else if (metric_str == "euclidean") { + return raft::bench::ann::Metric::kEuclidean; + } else { + throw std::runtime_error("invalid metric: '" + metric_str + "'"); + } +} + +inline auto parse_memory_type(const std::string& memory_type) -> MemoryType +{ + if (memory_type == "host") { + return MemoryType::Host; + } else if (memory_type == "mmap") { + return MemoryType::HostMmap; + } else if (memory_type == "device") { + return MemoryType::Device; + } else { + throw std::runtime_error("invalid memory type: '" + memory_type + "'"); + } +} + +class 
AlgoProperty { + public: + inline AlgoProperty() {} + inline AlgoProperty(MemoryType dataset_memory_type_, MemoryType query_memory_type_) + : dataset_memory_type(dataset_memory_type_), query_memory_type(query_memory_type_) + { + } + MemoryType dataset_memory_type; + // neighbors/distances should have same memory type as queries + MemoryType query_memory_type; + virtual ~AlgoProperty() = default; +}; + +class AnnBase { + public: + inline AnnBase(Metric metric, int dim) : metric_(metric), dim_(dim) {} + virtual ~AnnBase() = default; + + protected: + Metric metric_; + int dim_; +}; + +template +class ANN : public AnnBase { + public: + struct AnnSearchParam { + Objective metric_objective = Objective::LATENCY; + virtual ~AnnSearchParam() = default; + [[nodiscard]] virtual auto needs_dataset() const -> bool { return false; }; + }; + + inline ANN(Metric metric, int dim) : AnnBase(metric, dim) {} + + virtual void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) = 0; + + virtual void set_search_param(const AnnSearchParam& param) = 0; + // TODO: this assumes that an algorithm can always return k results. + // This is not always possible. + virtual void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const = 0; + + virtual void save(const std::string& file) const = 0; + virtual void load(const std::string& file) = 0; + + virtual AlgoProperty get_preference() const = 0; + + // Some algorithms don't save the building dataset in their indices. + // So they should be given the access to that dataset during searching. + // The advantage of this way is that index has smaller size + // and many indices can share one dataset. + // + // SearchParam::needs_dataset() of such algorithm should be true, + // and set_search_dataset() should save the passed-in pointer somewhere. 
+ // The client code should call set_search_dataset() before searching, + // and should not release dataset before searching is finished. + virtual void set_search_dataset(const T* /*dataset*/, size_t /*nrow*/){}; +}; + +} // namespace raft::bench::ann + +#define REGISTER_ALGO_INSTANCE(DataT) \ + template auto raft::bench::ann::create_algo( \ + const std::string&, const std::string&, int, const nlohmann::json&, const std::vector&) \ + ->std::unique_ptr>; \ + template auto raft::bench::ann::create_search_param(const std::string&, \ + const nlohmann::json&) \ + ->std::unique_ptr::AnnSearchParam>; diff --git a/cpp/bench/ann/src/common/benchmark.cpp b/cpp/bench/ann/src/common/benchmark.cpp new file mode 100644 index 000000000..6424a3647 --- /dev/null +++ b/cpp/bench/ann/src/common/benchmark.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "cuda_stub.hpp" // must go first + +#include "ann_types.hpp" + +#define JSON_DIAGNOSTICS 1 +#include + +#include +#include + +#include +#include + +namespace raft::bench::ann { + +struct lib_handle { + void* handle{nullptr}; + explicit lib_handle(const std::string& name) + { + handle = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL); + if (handle == nullptr) { + auto error_msg = "Failed to load " + name; + auto err = dlerror(); + if (err != nullptr && err[0] != '\0') { error_msg += ": " + std::string(err); } + throw std::runtime_error(error_msg); + } + } + ~lib_handle() noexcept + { + if (handle != nullptr) { dlclose(handle); } + } +}; + +auto load_lib(const std::string& algo) -> void* +{ + static std::unordered_map libs{}; + auto found = libs.find(algo); + + if (found != libs.end()) { return found->second.handle; } + auto lib_name = "lib" + algo + "_ann_bench.so"; + return libs.emplace(algo, lib_name).first->second.handle; +} + +auto get_fun_name(void* addr) -> std::string +{ + Dl_info dl_info; + if (dladdr(addr, &dl_info) != 0) { + if (dl_info.dli_sname != nullptr && dl_info.dli_sname[0] != '\0') { + return std::string{dl_info.dli_sname}; + } + } + throw std::logic_error("Failed to find out name of the looked up function"); +} + +template +auto create_algo(const std::string& algo, + const std::string& distance, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) -> std::unique_ptr> +{ + static auto fname = get_fun_name(reinterpret_cast(&create_algo)); + auto handle = load_lib(algo); + auto fun_addr = dlsym(handle, fname.c_str()); + if (fun_addr == nullptr) { + throw std::runtime_error("Couldn't load the create_algo function (" + algo + ")"); + } + auto fun = reinterpret_cast)>(fun_addr); + return fun(algo, distance, dim, conf, dev_list); +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + static auto fname = 
get_fun_name(reinterpret_cast(&create_search_param)); + auto handle = load_lib(algo); + auto fun_addr = dlsym(handle, fname.c_str()); + if (fun_addr == nullptr) { + throw std::runtime_error("Couldn't load the create_search_param function (" + algo + ")"); + } + auto fun = reinterpret_cast)>(fun_addr); + return fun(algo, conf); +} + +}; // namespace raft::bench::ann + +REGISTER_ALGO_INSTANCE(float); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#include "benchmark.hpp" + +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp new file mode 100644 index 000000000..a2e77323c --- /dev/null +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -0,0 +1,714 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "ann_types.hpp" +#include "conf.hpp" +#include "dataset.hpp" +#include "util.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace raft::bench::ann { + +std::mutex init_mutex; +std::condition_variable cond_var; +std::atomic_int processed_threads{0}; + +static inline std::unique_ptr current_algo{nullptr}; +static inline std::shared_ptr current_algo_props{nullptr}; + +using kv_series = std::vector>>; + +inline auto apply_overrides(const std::vector& configs, + const kv_series& overrides, + std::size_t override_idx = 0) -> std::vector +{ + std::vector results{}; + if (override_idx >= overrides.size()) { + auto n = configs.size(); + for (size_t i = 0; i < n; i++) { + auto c = configs[i]; + c["override_suffix"] = n > 1 ? "/" + std::to_string(i) : ""; + results.push_back(c); + } + return results; + } + auto rec_configs = apply_overrides(configs, overrides, override_idx + 1); + auto [key, vals] = overrides[override_idx]; + auto n = vals.size(); + for (size_t i = 0; i < n; i++) { + const auto& val = vals[i]; + for (auto rc : rec_configs) { + if (n > 1) { + rc["override_suffix"] = + static_cast(rc["override_suffix"]) + "/" + std::to_string(i); + } + rc[key] = val; + results.push_back(rc); + } + } + return results; +} + +inline auto apply_overrides(const nlohmann::json& config, + const kv_series& overrides, + std::size_t override_idx = 0) +{ + return apply_overrides(std::vector{config}, overrides, 0); +} + +inline void dump_parameters(::benchmark::State& state, nlohmann::json params) +{ + std::string label = ""; + bool label_empty = true; + for (auto& [key, val] : params.items()) { + if (val.is_number()) { + state.counters.insert({{key, val}}); + } else if (val.is_boolean()) { + state.counters.insert({{key, val ? 
1.0 : 0.0}}); + } else { + auto kv = key + "=" + val.dump(); + if (label_empty) { + label = kv; + } else { + label += "#" + kv; + } + label_empty = false; + } + } + if (!label_empty) { state.SetLabel(label); } +} + +inline auto parse_algo_property(AlgoProperty prop, const nlohmann::json& conf) -> AlgoProperty +{ + if (conf.contains("dataset_memory_type")) { + prop.dataset_memory_type = parse_memory_type(conf.at("dataset_memory_type")); + } + if (conf.contains("query_memory_type")) { + prop.query_memory_type = parse_memory_type(conf.at("query_memory_type")); + } + return prop; +}; + +template +void bench_build(::benchmark::State& state, + std::shared_ptr> dataset, + Configuration::Index index, + bool force_overwrite) +{ + dump_parameters(state, index.build_param); + if (file_exists(index.file)) { + if (force_overwrite) { + log_info("Overwriting file: %s", index.file.c_str()); + } else { + return state.SkipWithMessage( + "Index file already exists (use --force to overwrite the index)."); + } + } + + std::unique_ptr> algo; + try { + algo = ann::create_algo( + index.algo, dataset->distance(), dataset->dim(), index.build_param, index.dev_list); + } catch (const std::exception& e) { + return state.SkipWithError("Failed to create an algo: " + std::string(e.what())); + } + + const auto algo_property = parse_algo_property(algo->get_preference(), index.build_param); + + const T* base_set = dataset->base_set(algo_property.dataset_memory_type); + std::size_t index_size = dataset->base_set_size(); + + cuda_timer gpu_timer; + { + nvtx_case nvtx{state.name()}; + for (auto _ : state) { + [[maybe_unused]] auto ntx_lap = nvtx.lap(); + [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); + try { + algo->build(base_set, index_size, gpu_timer.stream()); + } catch (const std::exception& e) { + state.SkipWithError(std::string(e.what())); + } + } + } + state.counters.insert( + {{"GPU", gpu_timer.total_time() / state.iterations()}, {"index_size", index_size}}); + + if (state.skipped()) { 
return; } + make_sure_parent_dir_exists(index.file); + algo->save(index.file); +} + +template +void bench_search(::benchmark::State& state, + Configuration::Index index, + std::size_t search_param_ix, + std::shared_ptr> dataset, + Objective metric_objective) +{ + std::size_t queries_processed = 0; + + const auto& sp_json = index.search_params[search_param_ix]; + + if (state.thread_index() == 0) { dump_parameters(state, sp_json); } + + // NB: `k` and `n_queries` are guaranteed to be populated in conf.cpp + const std::uint32_t k = sp_json["k"]; + // Amount of data processes in one go + const std::size_t n_queries = sp_json["n_queries"]; + // Round down the query data to a multiple of the batch size to loop over full batches of data + const std::size_t query_set_size = (dataset->query_set_size() / n_queries) * n_queries; + + if (dataset->query_set_size() < n_queries) { + std::stringstream msg; + msg << "Not enough queries in benchmark set. Expected " << n_queries << ", actual " + << dataset->query_set_size(); + return state.SkipWithError(msg.str()); + } + + // Each thread start from a different offset, so that the queries that they process do not + // overlap. + std::ptrdiff_t batch_offset = (state.thread_index() * n_queries) % query_set_size; + std::ptrdiff_t queries_stride = state.threads() * n_queries; + // Output is saved into a contiguous buffer (separate buffers for each thread). + std::ptrdiff_t out_offset = 0; + + const T* query_set = nullptr; + + if (!file_exists(index.file)) { + state.SkipWithError("Index file is missing. 
Run the benchmark in the build mode first."); + return; + } + + /** + * Make sure the first thread loads the algo and dataset + */ + if (state.thread_index() == 0) { + std::unique_lock lk(init_mutex); + cond_var.wait(lk, [] { return processed_threads.load(std::memory_order_acquire) == 0; }); + // algo is static to cache it between close search runs to save time on index loading + static std::string index_file = ""; + if (index.file != index_file) { + current_algo.reset(); + index_file = index.file; + } + + std::unique_ptr::AnnSearchParam> search_param; + ANN* algo; + try { + if (!current_algo || (algo = dynamic_cast*>(current_algo.get())) == nullptr) { + auto ualgo = ann::create_algo( + index.algo, dataset->distance(), dataset->dim(), index.build_param, index.dev_list); + algo = ualgo.get(); + algo->load(index_file); + current_algo = std::move(ualgo); + } + search_param = ann::create_search_param(index.algo, sp_json); + search_param->metric_objective = metric_objective; + } catch (const std::exception& e) { + state.SkipWithError("Failed to create an algo: " + std::string(e.what())); + return; + } + + auto algo_property = parse_algo_property(algo->get_preference(), sp_json); + current_algo_props = std::make_shared(algo_property.dataset_memory_type, + algo_property.query_memory_type); + + if (search_param->needs_dataset()) { + try { + algo->set_search_dataset(dataset->base_set(current_algo_props->dataset_memory_type), + dataset->base_set_size()); + } catch (const std::exception& ex) { + state.SkipWithError("The algorithm '" + index.name + + "' requires the base set, but it's not available. 
" + + "Exception: " + std::string(ex.what())); + return; + } + } + try { + algo->set_search_param(*search_param); + + } catch (const std::exception& ex) { + state.SkipWithError("An error occurred setting search parameters: " + std::string(ex.what())); + return; + } + + query_set = dataset->query_set(current_algo_props->query_memory_type); + processed_threads.store(state.threads(), std::memory_order_acq_rel); + cond_var.notify_all(); + } else { + std::unique_lock lk(init_mutex); + // All other threads will wait for the first thread to initialize the algo. + cond_var.wait(lk, [&state] { + return processed_threads.load(std::memory_order_acquire) == state.threads(); + }); + // gbench ensures that all threads are synchronized at the start of the benchmark loop. + // We are accessing shared variables (like current_algo, current_algo_probs) before the + // benchmark loop, therefore the synchronization here is necessary. + } + const auto algo_property = *current_algo_props; + query_set = dataset->query_set(algo_property.query_memory_type); + + /** + * Each thread will manage its own outputs + */ + std::shared_ptr> distances = + std::make_shared>(algo_property.query_memory_type, k * query_set_size); + std::shared_ptr> neighbors = + std::make_shared>(algo_property.query_memory_type, k * query_set_size); + + cuda_timer gpu_timer; + auto start = std::chrono::high_resolution_clock::now(); + { + nvtx_case nvtx{state.name()}; + + ANN* algo = dynamic_cast*>(current_algo.get()); + for (auto _ : state) { + [[maybe_unused]] auto ntx_lap = nvtx.lap(); + [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); + + // run the search + try { + algo->search(query_set + batch_offset * dataset->dim(), + n_queries, + k, + neighbors->data + out_offset * k, + distances->data + out_offset * k, + gpu_timer.stream()); + } catch (const std::exception& e) { + state.SkipWithError(std::string(e.what())); + } + + // advance to the next batch + batch_offset = (batch_offset + queries_stride) % query_set_size; + 
out_offset = (out_offset + n_queries) % query_set_size; + + queries_processed += n_queries; + } + } + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast>(end - start).count(); + if (state.thread_index() == 0) { state.counters.insert({{"end_to_end", duration}}); } + state.counters.insert( + {"Latency", {duration / double(state.iterations()), benchmark::Counter::kAvgThreads}}); + + state.SetItemsProcessed(queries_processed); + if (cudart.found()) { + double gpu_time_per_iteration = gpu_timer.total_time() / (double)state.iterations(); + state.counters.insert({"GPU", {gpu_time_per_iteration, benchmark::Counter::kAvgThreads}}); + } + + // This will be the total number of queries across all threads + state.counters.insert({{"total_queries", queries_processed}}); + + if (state.skipped()) { return; } + + // assume thread has finished processing successfully at this point + // last thread to finish processing notifies all + if (processed_threads-- == 0) { cond_var.notify_all(); } + + // Each thread calculates recall on their partition of queries. + // evaluate recall + if (dataset->max_k() >= k) { + const std::int32_t* gt = dataset->gt_set(); + const std::uint32_t max_k = dataset->max_k(); + buf neighbors_host = neighbors->move(MemoryType::Host); + std::size_t rows = std::min(queries_processed, query_set_size); + std::size_t match_count = 0; + std::size_t total_count = rows * static_cast(k); + + // We go through the groundtruth with same stride as the benchmark loop. 
+ size_t out_offset = 0; + size_t batch_offset = (state.thread_index() * n_queries) % query_set_size; + while (out_offset < rows) { + for (std::size_t i = 0; i < n_queries; i++) { + size_t i_orig_idx = batch_offset + i; + size_t i_out_idx = out_offset + i; + if (i_out_idx < rows) { + for (std::uint32_t j = 0; j < k; j++) { + auto act_idx = std::int32_t(neighbors_host.data[i_out_idx * k + j]); + for (std::uint32_t l = 0; l < k; l++) { + auto exp_idx = gt[i_orig_idx * max_k + l]; + if (act_idx == exp_idx) { + match_count++; + break; + } + } + } + } + } + out_offset += n_queries; + batch_offset = (batch_offset + queries_stride) % query_set_size; + } + double actual_recall = static_cast(match_count) / static_cast(total_count); + state.counters.insert({"Recall", {actual_recall, benchmark::Counter::kAvgThreads}}); + } +} + +inline void printf_usage() +{ + ::benchmark::PrintDefaultHelp(); + fprintf(stdout, + " [--build|--search] \n" + " [--force]\n" + " [--data_prefix=]\n" + " [--index_prefix=]\n" + " [--override_kv=]\n" + " [--mode=\n" + " [--threads=min[:max]]\n" + " .json\n" + "\n" + "Note the non-standard benchmark parameters:\n" + " --build: build mode, will build index\n" + " --search: search mode, will search using the built index\n" + " one and only one of --build and --search should be specified\n" + " --force: force overwriting existing index files\n" + " --data_prefix=:" + " prepend to dataset file paths specified in the .json (default = " + "'data/').\n" + " --index_prefix=:" + " prepend to index file paths specified in the .json (default = " + "'index/').\n" + " --override_kv=:" + " override a build/search key one or more times multiplying the number of configurations;" + " you can use this parameter multiple times to get the Cartesian product of benchmark" + " configs.\n" + " --mode=" + " run the benchmarks in latency (accumulate times spent in each batch) or " + " throughput (pipeline batches and measure end-to-end) mode\n" + " --threads=min[:max] specify 
the number threads to use for throughput benchmark." + " Power of 2 values between 'min' and 'max' will be used. If only 'min' is specified," + " then a single test is run with 'min' threads. By default min=1, max=.\n"); +} + +template +void register_build(std::shared_ptr> dataset, + std::vector indices, + bool force_overwrite) +{ + for (auto index : indices) { + auto suf = static_cast(index.build_param["override_suffix"]); + auto file_suf = suf; + index.build_param.erase("override_suffix"); + std::replace(file_suf.begin(), file_suf.end(), '/', '-'); + index.file += file_suf; + auto* b = ::benchmark::RegisterBenchmark( + index.name + suf, bench_build, dataset, index, force_overwrite); + b->Unit(benchmark::kSecond); + b->MeasureProcessCPUTime(); + b->UseRealTime(); + } +} + +template +void register_search(std::shared_ptr> dataset, + std::vector indices, + Objective metric_objective, + const std::vector& threads) +{ + for (auto index : indices) { + for (std::size_t i = 0; i < index.search_params.size(); i++) { + auto suf = static_cast(index.search_params[i]["override_suffix"]); + index.search_params[i].erase("override_suffix"); + + auto* b = ::benchmark::RegisterBenchmark( + index.name + suf, bench_search, index, i, dataset, metric_objective) + ->Unit(benchmark::kMillisecond) + /** + * The following are important for getting accuracy QPS measurements on both CPU + * and GPU These make sure that + * - `end_to_end` ~ (`Time` * `Iterations`) + * - `items_per_second` ~ (`total_queries` / `end_to_end`) + * - Throughput = `items_per_second` + */ + ->MeasureProcessCPUTime() + ->UseRealTime(); + + if (metric_objective == Objective::THROUGHPUT) { b->ThreadRange(threads[0], threads[1]); } + } + } +} + +template +void dispatch_benchmark(const Configuration& conf, + bool force_overwrite, + bool build_mode, + bool search_mode, + std::string data_prefix, + std::string index_prefix, + kv_series override_kv, + Objective metric_objective, + const std::vector& threads) +{ + if 
(cudart.found()) { + for (auto [key, value] : cuda_info()) { + ::benchmark::AddCustomContext(key, value); + } + } + const auto dataset_conf = conf.get_dataset_conf(); + auto base_file = combine_path(data_prefix, dataset_conf.base_file); + auto query_file = combine_path(data_prefix, dataset_conf.query_file); + auto gt_file = dataset_conf.groundtruth_neighbors_file; + if (gt_file.has_value()) { gt_file.emplace(combine_path(data_prefix, gt_file.value())); } + auto dataset = std::make_shared>(dataset_conf.name, + base_file, + dataset_conf.subset_first_row, + dataset_conf.subset_size, + query_file, + dataset_conf.distance, + gt_file); + ::benchmark::AddCustomContext("dataset", dataset_conf.name); + ::benchmark::AddCustomContext("distance", dataset_conf.distance); + std::vector indices = conf.get_indices(); + if (build_mode) { + if (file_exists(base_file)) { + log_info("Using the dataset file '%s'", base_file.c_str()); + ::benchmark::AddCustomContext("n_records", std::to_string(dataset->base_set_size())); + ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim())); + } else { + log_warn("Dataset file '%s' does not exist; benchmarking index building is impossible.", + base_file.c_str()); + } + std::vector more_indices{}; + for (auto& index : indices) { + for (auto param : apply_overrides(index.build_param, override_kv)) { + auto modified_index = index; + modified_index.build_param = param; + modified_index.file = combine_path(index_prefix, modified_index.file); + more_indices.push_back(modified_index); + } + } + register_build(dataset, more_indices, force_overwrite); + } else if (search_mode) { + if (file_exists(query_file)) { + log_info("Using the query file '%s'", query_file.c_str()); + ::benchmark::AddCustomContext("max_n_queries", std::to_string(dataset->query_set_size())); + ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim())); + if (gt_file.has_value()) { + if (file_exists(*gt_file)) { + log_info("Using the ground truth file '%s'", 
gt_file->c_str()); + ::benchmark::AddCustomContext("max_k", std::to_string(dataset->max_k())); + } else { + log_warn("Ground truth file '%s' does not exist; the recall won't be reported.", + gt_file->c_str()); + } + } else { + log_warn( + "Ground truth file is not provided; the recall won't be reported. NB: use " + "the 'groundtruth_neighbors_file' alongside the 'query_file' key to specify the " + "path to " + "the ground truth in your conf.json."); + } + } else { + log_warn("Query file '%s' does not exist; benchmarking search is impossible.", + query_file.c_str()); + } + for (auto& index : indices) { + index.search_params = apply_overrides(index.search_params, override_kv); + index.file = combine_path(index_prefix, index.file); + } + register_search(dataset, indices, metric_objective, threads); + } +} + +inline auto parse_bool_flag(const char* arg, const char* pat, bool& result) -> bool +{ + if (strcmp(arg, pat) == 0) { + result = true; + return true; + } + return false; +} + +inline auto parse_string_flag(const char* arg, const char* pat, std::string& result) -> bool +{ + auto n = strlen(pat); + if (strncmp(pat, arg, strlen(pat)) == 0) { + result = arg + n + 1; + return true; + } + return false; +} + +inline auto run_main(int argc, char** argv) -> int +{ + bool force_overwrite = false; + bool build_mode = false; + bool search_mode = false; + std::string data_prefix = "data"; + std::string index_prefix = "index"; + std::string new_override_kv = ""; + std::string mode = "latency"; + std::string threads_arg_txt = ""; + std::vector threads = {1, -1}; // min_thread, max_thread + std::string log_level_str = ""; + int raft_log_level = raft::logger::get(RAFT_NAME).get_level(); + kv_series override_kv{}; + + char arg0_default[] = "benchmark"; // NOLINT + char* args_default = arg0_default; + if (!argv) { + argc = 1; + argv = &args_default; + } + if (argc == 1) { + printf_usage(); + return -1; + } + + char* conf_path = argv[--argc]; + std::ifstream conf_stream(conf_path); + 
+ for (int i = 1; i < argc; i++) { + if (parse_bool_flag(argv[i], "--force", force_overwrite) || + parse_bool_flag(argv[i], "--build", build_mode) || + parse_bool_flag(argv[i], "--search", search_mode) || + parse_string_flag(argv[i], "--data_prefix", data_prefix) || + parse_string_flag(argv[i], "--index_prefix", index_prefix) || + parse_string_flag(argv[i], "--mode", mode) || + parse_string_flag(argv[i], "--override_kv", new_override_kv) || + parse_string_flag(argv[i], "--threads", threads_arg_txt) || + parse_string_flag(argv[i], "--raft_log_level", log_level_str)) { + if (!log_level_str.empty()) { + raft_log_level = std::stoi(log_level_str); + log_level_str = ""; + } + if (!threads_arg_txt.empty()) { + auto threads_arg = split(threads_arg_txt, ':'); + threads[0] = std::stoi(threads_arg[0]); + if (threads_arg.size() > 1) { + threads[1] = std::stoi(threads_arg[1]); + } else { + threads[1] = threads[0]; + } + threads_arg_txt = ""; + } + if (!new_override_kv.empty()) { + auto kvv = split(new_override_kv, ':'); + auto key = kvv[0]; + std::vector vals{}; + for (std::size_t j = 1; j < kvv.size(); j++) { + vals.push_back(nlohmann::json::parse(kvv[j])); + } + override_kv.emplace_back(key, vals); + new_override_kv = ""; + } + for (int j = i; j < argc - 1; j++) { + argv[j] = argv[j + 1]; + } + argc--; + i--; + } + } + + raft::logger::get(RAFT_NAME).set_level(raft_log_level); + + Objective metric_objective = Objective::LATENCY; + if (mode == "throughput") { metric_objective = Objective::THROUGHPUT; } + + int max_threads = + (metric_objective == Objective::THROUGHPUT) ? std::thread::hardware_concurrency() : 1; + if (threads[1] == -1) threads[1] = max_threads; + + if (metric_objective == Objective::LATENCY) { + if (threads[0] != 1 || threads[1] != 1) { + log_warn("Latency mode enabled. 
Overriding threads arg, running with single thread."); + threads = {1, 1}; + } + } + + if (build_mode == search_mode) { + log_error("One and only one of --build and --search should be specified"); + printf_usage(); + return -1; + } + + if (!conf_stream) { + log_error("Can't open configuration file: %s", conf_path); + return -1; + } + + if (cudart.needed() && !cudart.found()) { + log_warn("cudart library is not found, GPU-based indices won't work."); + } + + Configuration conf(conf_stream); + std::string dtype = conf.get_dataset_conf().dtype; + + if (dtype == "float") { + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective, + threads); + } else if (dtype == "uint8") { + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective, + threads); + } else if (dtype == "int8") { + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective, + threads); + } else { + log_error("datatype '%s' is not supported", dtype.c_str()); + return -1; + } + + ::benchmark::Initialize(&argc, argv, printf_usage); + if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return -1; + ::benchmark::RunSpecifiedBenchmarks(); + ::benchmark::Shutdown(); + // Release a possibly cached ANN object, so that it cannot be alive longer than the handle + // to a shared library it depends on (dynamic benchmark executable). + current_algo.reset(); + return 0; +} +}; // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/conf.hpp b/cpp/bench/ann/src/common/conf.hpp new file mode 100644 index 000000000..405b00a74 --- /dev/null +++ b/cpp/bench/ann/src/common/conf.hpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "util.hpp" + +#include +#include +#include +#include +#include + +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +class Configuration { + public: + struct Index { + std::string name; + std::string algo; + nlohmann::json build_param; + std::string file; + std::vector dev_list; + + int batch_size; + int k; + std::vector search_params; + }; + + struct DatasetConf { + std::string name; + std::string base_file; + // use only a subset of base_file, + // the range of rows is [subset_first_row, subset_first_row + subset_size) + // however, subset_size = 0 means using all rows after subset_first_row + // that is, the subset is [subset_first_row, #rows in base_file) + size_t subset_first_row{0}; + size_t subset_size{0}; + std::string query_file; + std::string distance; + std::optional groundtruth_neighbors_file{std::nullopt}; + + // data type of input dataset, possible values ["float", "int8", "uint8"] + std::string dtype; + }; + + explicit inline Configuration(std::istream& conf_stream) + { + // to enable comments in json + auto conf = nlohmann::json::parse(conf_stream, nullptr, true, true); + + parse_dataset_(conf.at("dataset")); + parse_index_(conf.at("index"), conf.at("search_basic_param")); + } + + [[nodiscard]] inline auto get_dataset_conf() const -> DatasetConf { return dataset_conf_; } + [[nodiscard]] inline auto get_indices() const -> std::vector { return 
indices_; }; + + private: + inline void parse_dataset_(const nlohmann::json& conf) + { + dataset_conf_.name = conf.at("name"); + dataset_conf_.base_file = conf.at("base_file"); + dataset_conf_.query_file = conf.at("query_file"); + dataset_conf_.distance = conf.at("distance"); + + if (conf.contains("groundtruth_neighbors_file")) { + dataset_conf_.groundtruth_neighbors_file = conf.at("groundtruth_neighbors_file"); + } + if (conf.contains("subset_first_row")) { + dataset_conf_.subset_first_row = conf.at("subset_first_row"); + } + if (conf.contains("subset_size")) { dataset_conf_.subset_size = conf.at("subset_size"); } + + if (conf.contains("dtype")) { + dataset_conf_.dtype = conf.at("dtype"); + } else { + auto filename = dataset_conf_.base_file; + if (!filename.compare(filename.size() - 4, 4, "fbin")) { + dataset_conf_.dtype = "float"; + } else if (!filename.compare(filename.size() - 5, 5, "u8bin")) { + dataset_conf_.dtype = "uint8"; + } else if (!filename.compare(filename.size() - 5, 5, "i8bin")) { + dataset_conf_.dtype = "int8"; + } else { + log_error("Could not determine data type of the dataset %s", filename.c_str()); + } + } + } + inline void parse_index_(const nlohmann::json& index_conf, + const nlohmann::json& search_basic_conf) + { + const int batch_size = search_basic_conf.at("batch_size"); + const int k = search_basic_conf.at("k"); + + for (const auto& conf : index_conf) { + Index index; + index.name = conf.at("name"); + index.algo = conf.at("algo"); + index.build_param = conf.at("build_param"); + index.file = conf.at("file"); + index.batch_size = batch_size; + index.k = k; + + if (conf.contains("multigpu")) { + for (auto it : conf.at("multigpu")) { + index.dev_list.push_back(it); + } + if (index.dev_list.empty()) { throw std::runtime_error("dev_list shouln't be empty!"); } + index.dev_list.shrink_to_fit(); + index.build_param["multigpu"] = conf["multigpu"]; + } + + for (auto param : conf.at("search_params")) { + /* ### Special parameters for backward 
compatibility ### + + - Local values of `k` and `n_queries` take priority. + - The legacy "batch_size" renamed to `n_queries`. + - Basic search params are used otherwise. + */ + if (!param.contains("k")) { param["k"] = k; } + if (!param.contains("n_queries")) { + if (param.contains("batch_size")) { + param["n_queries"] = param["batch_size"]; + param.erase("batch_size"); + } else { + param["n_queries"] = batch_size; + } + } + index.search_params.push_back(param); + } + + indices_.push_back(index); + } + } + + DatasetConf dataset_conf_; + std::vector indices_; +}; + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp b/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp new file mode 100644 index 000000000..9132db7c0 --- /dev/null +++ b/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +namespace raft::mr { +/** + * @brief `device_memory_resource` derived class that uses mmap to allocate memory. + * This class enables memory allocation using huge pages. + * It is assumed that the allocated memory is directly accessible on device. This currently only + * works on GH systems. 
+ * + * TODO(tfeher): consider improving or removing this helper once we made progress with + * https://github.com/rapidsai/raft/issues/1819 + */ +class cuda_huge_page_resource final : public rmm::mr::device_memory_resource { + public: + cuda_huge_page_resource() = default; + ~cuda_huge_page_resource() override = default; + cuda_huge_page_resource(cuda_huge_page_resource const&) = default; + cuda_huge_page_resource(cuda_huge_page_resource&&) = default; + cuda_huge_page_resource& operator=(cuda_huge_page_resource const&) = default; + cuda_huge_page_resource& operator=(cuda_huge_page_resource&&) = default; + + /** + * @brief Query whether the resource supports use of non-null CUDA streams for + * allocation/deallocation. `cuda_huge_page_resource` does not support streams. + * + * @returns bool false + */ + [[nodiscard]] bool supports_streams() const noexcept override { return false; } + + /** + * @brief Query whether the resource supports the get_mem_info API. + * + * @return true + */ + [[nodiscard]] bool supports_get_mem_info() const noexcept override { return true; } + + private: + /** + * @brief Allocates memory of size at least `bytes` using cudaMalloc. + * + * The returned pointer has at least 256B alignment. + * + * @note Stream argument is ignored + * + * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled + * + * @param bytes The size, in bytes, of the allocation + * @return void* Pointer to the newly allocated memory + */ + void* do_allocate(std::size_t bytes, rmm::cuda_stream_view) override + { + void* _addr{nullptr}; + _addr = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (_addr == MAP_FAILED) { RAFT_FAIL("huge_page_resource::MAP FAILED"); } + if (madvise(_addr, bytes, MADV_HUGEPAGE) == -1) { + munmap(_addr, bytes); + RAFT_FAIL("huge_page_resource::madvise MADV_HUGEPAGE"); + } + memset(_addr, 0, bytes); + return _addr; + } + + /** + * @brief Deallocate memory pointed to by \p p. 
+ * + * @note Stream argument is ignored. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + */ + void do_deallocate(void* ptr, std::size_t size, rmm::cuda_stream_view) override + { + if (munmap(ptr, size) == -1) { RAFT_FAIL("huge_page_resource::munmap"); } + } + + /** + * @brief Compare this resource to another. + * + * Two cuda_huge_page_resources always compare equal, because they can each + * deallocate memory allocated by the other. + * + * @throws Nothing. + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equal + */ + [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override + { + return dynamic_cast(&other) != nullptr; + } + + /** + * @brief Get free and available memory for memory resource + * + * @throws `rmm::cuda_error` if unable to retrieve memory info. + * + * @return std::pair contaiing free_size and total_size of memory + */ + [[nodiscard]] std::pair do_get_mem_info( + rmm::cuda_stream_view) const override + { + std::size_t free_size{}; + std::size_t total_size{}; + RMM_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); + return std::make_pair(free_size, total_size); + } +}; +} // namespace raft::mr \ No newline at end of file diff --git a/cpp/bench/ann/src/common/cuda_pinned_resource.hpp b/cpp/bench/ann/src/common/cuda_pinned_resource.hpp new file mode 100644 index 000000000..28ca691f8 --- /dev/null +++ b/cpp/bench/ann/src/common/cuda_pinned_resource.hpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include + +#include + +namespace raft::mr { +/** + * @brief `device_memory_resource` derived class that uses cudaMallocHost/Free for + * allocation/deallocation. + * + * This is almost the same as rmm::mr::host::pinned_memory_resource, but it has + * device_memory_resource as base class. Pinned memory can be accessed from device, + * and using this allocator we can create device_mdarray backed by pinned allocator. + * + * TODO(tfeher): it would be preferred to just rely on the existing allocator from rmm + * (pinned_memory_resource), but that is incompatible with the container_policy class + * for device matrix, because the latter expects a device_memory_resource. We shall + * revise this once we progress with Issue https://github.com/rapidsai/raft/issues/1819 + */ +class cuda_pinned_resource final : public rmm::mr::device_memory_resource { + public: + cuda_pinned_resource() = default; + ~cuda_pinned_resource() override = default; + cuda_pinned_resource(cuda_pinned_resource const&) = default; + cuda_pinned_resource(cuda_pinned_resource&&) = default; + cuda_pinned_resource& operator=(cuda_pinned_resource const&) = default; + cuda_pinned_resource& operator=(cuda_pinned_resource&&) = default; + + /** + * @brief Query whether the resource supports use of non-null CUDA streams for + * allocation/deallocation. `cuda_pinned_resource` does not support streams. 
+ * + * @returns bool false + */ + [[nodiscard]] bool supports_streams() const noexcept override { return false; } + + /** + * @brief Query whether the resource supports the get_mem_info API. + * + * @return true + */ + [[nodiscard]] bool supports_get_mem_info() const noexcept override { return true; } + + private: + /** + * @brief Allocates memory of size at least `bytes` using cudaMalloc. + * + * The returned pointer has at least 256B alignment. + * + * @note Stream argument is ignored + * + * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled + * + * @param bytes The size, in bytes, of the allocation + * @return void* Pointer to the newly allocated memory + */ + void* do_allocate(std::size_t bytes, rmm::cuda_stream_view) override + { + void* ptr{nullptr}; + RMM_CUDA_TRY_ALLOC(cudaMallocHost(&ptr, bytes)); + return ptr; + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * @note Stream argument is ignored. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + */ + void do_deallocate(void* ptr, std::size_t, rmm::cuda_stream_view) override + { + RMM_ASSERT_CUDA_SUCCESS(cudaFreeHost(ptr)); + } + + /** + * @brief Compare this resource to another. + * + * Two cuda_pinned_resources always compare equal, because they can each + * deallocate memory allocated by the other. + * + * @throws Nothing. + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equal + */ + [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override + { + return dynamic_cast(&other) != nullptr; + } + + /** + * @brief Get free and available memory for memory resource + * + * @throws `rmm::cuda_error` if unable to retrieve memory info. 
+ * + * @return std::pair contaiing free_size and total_size of memory + */ + [[nodiscard]] std::pair do_get_mem_info( + rmm::cuda_stream_view) const override + { + std::size_t free_size{}; + std::size_t total_size{}; + RMM_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); + return std::make_pair(free_size, total_size); + } +}; +} // namespace raft::mr \ No newline at end of file diff --git a/cpp/bench/ann/src/common/cuda_stub.hpp b/cpp/bench/ann/src/common/cuda_stub.hpp new file mode 100644 index 000000000..6e3b63cd3 --- /dev/null +++ b/cpp/bench/ann/src/common/cuda_stub.hpp @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +/* +The content of this header is governed by two preprocessor definitions: + + - BUILD_CPU_ONLY - whether none of the CUDA functions are used. + - ANN_BENCH_LINK_CUDART - dynamically link against this string if defined. 
+ +______________________________________________________________________________ +|BUILD_CPU_ONLY | ANN_BENCH_LINK_CUDART | cudart | cuda_runtime_api.h | +| | | found | needed | included | +|---------|-----------------------|-----------|---------|--------------------| +| ON | | false | false | NO | +| ON | "cudart.so.xx.xx" | false | false | NO | +| OFF | | true | true | YES | +| OFF | "cudart.so.xx.xx" | | true | YES | +------------------------------------------------------------------------------ +*/ + +#ifndef BUILD_CPU_ONLY +#include +#ifdef ANN_BENCH_LINK_CUDART +#include +#include +#endif +#else +typedef void* cudaStream_t; +typedef void* cudaEvent_t; +#endif + +namespace raft::bench::ann { + +struct cuda_lib_handle { + void* handle{nullptr}; + explicit cuda_lib_handle() + { +#ifdef ANN_BENCH_LINK_CUDART + constexpr int kFlags = RTLD_NOW | RTLD_GLOBAL | RTLD_DEEPBIND | RTLD_NODELETE; + // The full name of the linked cudart library 'cudart.so.MAJOR.MINOR.PATCH' + char libname[] = ANN_BENCH_LINK_CUDART; // NOLINT + handle = dlopen(ANN_BENCH_LINK_CUDART, kFlags); + if (handle != nullptr) { return; } + // try strip the PATCH + auto p = strrchr(libname, '.'); + p[0] = 0; + handle = dlopen(libname, kFlags); + if (handle != nullptr) { return; } + // try set the MINOR version to 0 + p = strrchr(libname, '.'); + p[1] = '0'; + p[2] = 0; + handle = dlopen(libname, kFlags); + if (handle != nullptr) { return; } + // try strip the MINOR + p[0] = 0; + handle = dlopen(libname, kFlags); + if (handle != nullptr) { return; } + // try strip the MAJOR + p = strrchr(libname, '.'); + p[0] = 0; + handle = dlopen(libname, kFlags); +#endif + } + ~cuda_lib_handle() noexcept + { +#ifdef ANN_BENCH_LINK_CUDART + if (handle != nullptr) { dlclose(handle); } +#endif + } + + template + auto sym(const char* name) -> Symbol + { +#ifdef ANN_BENCH_LINK_CUDART + return reinterpret_cast(dlsym(handle, name)); +#else + return nullptr; +#endif + } + + /** Whether this is NOT a cpu-only package. 
*/ + [[nodiscard]] constexpr inline auto needed() const -> bool + { +#if defined(BUILD_CPU_ONLY) + return false; +#else + return true; +#endif + } + + /** CUDA found, either at compile time or at runtime. */ + [[nodiscard]] inline auto found() const -> bool + { +#if defined(BUILD_CPU_ONLY) + return false; +#elif defined(ANN_BENCH_LINK_CUDART) + return handle != nullptr; +#else + return true; +#endif + } +}; + +static inline cuda_lib_handle cudart{}; + +#ifdef ANN_BENCH_LINK_CUDART +namespace stub { + +[[gnu::weak, gnu::noinline]] cudaError_t cudaMemcpy(void* dst, + const void* src, + size_t count, + enum cudaMemcpyKind kind) +{ + return cudaSuccess; +} + +[[gnu::weak, gnu::noinline]] cudaError_t cudaMalloc(void** ptr, size_t size) +{ + *ptr = nullptr; + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] cudaError_t cudaMemset(void* devPtr, int value, size_t count) +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] cudaError_t cudaFree(void* devPtr) { return cudaSuccess; } +[[gnu::weak, gnu::noinline]] cudaError_t cudaStreamCreate(cudaStream_t* pStream) +{ + *pStream = 0; + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] cudaError_t cudaStreamCreateWithFlags(cudaStream_t* pStream, + unsigned int flags) +{ + *pStream = 0; + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] cudaError_t cudaStreamDestroy(cudaStream_t pStream) +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] cudaError_t cudaDeviceSynchronize() { return cudaSuccess; } + +[[gnu::weak, gnu::noinline]] cudaError_t cudaStreamSynchronize(cudaStream_t pStream) +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] cudaError_t cudaEventCreate(cudaEvent_t* event) +{ + *event = 0; + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream) +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] cudaError_t cudaEventSynchronize(cudaEvent_t event) +{ + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] cudaError_t 
cudaEventElapsedTime(float* ms, + cudaEvent_t start, + cudaEvent_t end) +{ + *ms = 0; + return cudaSuccess; +} +[[gnu::weak, gnu::noinline]] cudaError_t cudaEventDestroy(cudaEvent_t event) { return cudaSuccess; } +[[gnu::weak, gnu::noinline]] cudaError_t cudaGetDevice(int* device) +{ + *device = 0; + return cudaSuccess; +}; +[[gnu::weak, gnu::noinline]] cudaError_t cudaDriverGetVersion(int* driver) +{ + *driver = 0; + return cudaSuccess; +}; +[[gnu::weak, gnu::noinline]] cudaError_t cudaRuntimeGetVersion(int* runtime) +{ + *runtime = 0; + return cudaSuccess; +}; +[[gnu::weak, gnu::noinline]] cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp* prop, + int device) +{ + *prop = cudaDeviceProp{}; + return cudaSuccess; +} + +} // namespace stub + +#define RAFT_DECLARE_CUDART(fun) \ + static inline decltype(&stub::fun) fun = \ + cudart.found() ? cudart.sym(#fun) : &stub::fun + +RAFT_DECLARE_CUDART(cudaMemcpy); +RAFT_DECLARE_CUDART(cudaMalloc); +RAFT_DECLARE_CUDART(cudaMemset); +RAFT_DECLARE_CUDART(cudaFree); +RAFT_DECLARE_CUDART(cudaStreamCreate); +RAFT_DECLARE_CUDART(cudaStreamCreateWithFlags); +RAFT_DECLARE_CUDART(cudaStreamDestroy); +RAFT_DECLARE_CUDART(cudaDeviceSynchronize); +RAFT_DECLARE_CUDART(cudaStreamSynchronize); +RAFT_DECLARE_CUDART(cudaEventCreate); +RAFT_DECLARE_CUDART(cudaEventRecord); +RAFT_DECLARE_CUDART(cudaEventSynchronize); +RAFT_DECLARE_CUDART(cudaEventElapsedTime); +RAFT_DECLARE_CUDART(cudaEventDestroy); +RAFT_DECLARE_CUDART(cudaGetDevice); +RAFT_DECLARE_CUDART(cudaDriverGetVersion); +RAFT_DECLARE_CUDART(cudaRuntimeGetVersion); +RAFT_DECLARE_CUDART(cudaGetDeviceProperties); + +#undef RAFT_DECLARE_CUDART +#endif + +}; // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/dataset.hpp b/cpp/bench/ann/src/common/dataset.hpp new file mode 100644 index 000000000..ccc5915b3 --- /dev/null +++ b/cpp/bench/ann/src/common/dataset.hpp @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "util.hpp" + +#ifndef BUILD_CPU_ONLY +#include +#else +typedef uint16_t half; +#endif + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::ann { + +// http://big-ann-benchmarks.com/index.html: +// binary format that starts with 8 bytes of data consisting of num_points(uint32_t) +// num_dimensions(uint32) followed by num_pts x num_dimensions x sizeof(type) bytes of +// data stored one vector after another. +// Data files will have suffixes .fbin, .u8bin, and .i8bin to represent float32, uint8 +// and int8 type data. +// As extensions for this benchmark, half and int data files will have suffixes .f16bin +// and .ibin, respectively. 
+template +class BinFile { + public: + BinFile(const std::string& file, + const std::string& mode, + uint32_t subset_first_row = 0, + uint32_t subset_size = 0); + ~BinFile() + { + if (mapped_ptr_ != nullptr) { unmap(); } + if (fp_ != nullptr) { fclose(fp_); } + } + BinFile(const BinFile&) = delete; + BinFile& operator=(const BinFile&) = delete; + + void get_shape(size_t* nrows, int* ndims) const + { + assert(read_mode_); + if (!fp_) { open_file_(); } + *nrows = nrows_; + *ndims = ndims_; + } + + void read(T* data) const + { + assert(read_mode_); + if (!fp_) { open_file_(); } + size_t total = static_cast(nrows_) * ndims_; + if (fread(data, sizeof(T), total, fp_) != total) { + throw std::runtime_error("fread() BinFile " + file_ + " failed"); + } + } + + void write(const T* data, uint32_t nrows, uint32_t ndims) + { + assert(!read_mode_); + if (!fp_) { open_file_(); } + if (fwrite(&nrows, sizeof(uint32_t), 1, fp_) != 1) { + throw std::runtime_error("fwrite() BinFile " + file_ + " failed"); + } + if (fwrite(&ndims, sizeof(uint32_t), 1, fp_) != 1) { + throw std::runtime_error("fwrite() BinFile " + file_ + " failed"); + } + + size_t total = static_cast(nrows) * ndims; + if (fwrite(data, sizeof(T), total, fp_) != total) { + throw std::runtime_error("fwrite() BinFile " + file_ + " failed"); + } + } + + T* map() const + { + assert(read_mode_); + if (!fp_) { open_file_(); } + int fid = fileno(fp_); + mapped_ptr_ = mmap(nullptr, file_size_, PROT_READ, MAP_PRIVATE, fid, 0); + if (mapped_ptr_ == MAP_FAILED) { + mapped_ptr_ = nullptr; + throw std::runtime_error("mmap error: Value of errno " + std::to_string(errno) + ", " + + std::string(strerror(errno))); + } + return reinterpret_cast(reinterpret_cast(mapped_ptr_) + 2 * sizeof(uint32_t) + + subset_first_row_ * ndims_ * sizeof(T)); + } + + void unmap() const + { + if (munmap(mapped_ptr_, file_size_) == -1) { + throw std::runtime_error("munmap error: " + std::string(strerror(errno))); + } + } + + private: + void check_suffix_(); + 
void open_file_() const; + + std::string file_; + bool read_mode_; + uint32_t subset_first_row_; + uint32_t subset_size_; + + mutable FILE* fp_{nullptr}; + mutable uint32_t nrows_; + mutable uint32_t ndims_; + mutable size_t file_size_; + mutable void* mapped_ptr_{nullptr}; +}; + +template +BinFile::BinFile(const std::string& file, + const std::string& mode, + uint32_t subset_first_row, + uint32_t subset_size) + : file_(file), + read_mode_(mode == "r"), + subset_first_row_(subset_first_row), + subset_size_(subset_size), + fp_(nullptr) +{ + check_suffix_(); + + if (!read_mode_) { + if (mode == "w") { + if (subset_first_row != 0) { + throw std::runtime_error("subset_first_row should be zero for write mode"); + } + if (subset_size != 0) { + throw std::runtime_error("subset_size should be zero for write mode"); + } + } else { + throw std::runtime_error("BinFile's mode must be either 'r' or 'w': " + file_); + } + } +} + +template +void BinFile::open_file_() const +{ + fp_ = fopen(file_.c_str(), read_mode_ ? 
"r" : "w"); + if (!fp_) { throw std::runtime_error("open BinFile failed: " + file_); } + + if (read_mode_) { + struct stat statbuf; + if (stat(file_.c_str(), &statbuf) != 0) { throw std::runtime_error("stat() failed: " + file_); } + file_size_ = statbuf.st_size; + + uint32_t header[2]; + if (fread(header, sizeof(uint32_t), 2, fp_) != 2) { + throw std::runtime_error("read header of BinFile failed: " + file_); + } + nrows_ = header[0]; + ndims_ = header[1]; + + size_t expected_file_size = + 2 * sizeof(uint32_t) + static_cast(nrows_) * ndims_ * sizeof(T); + if (file_size_ != expected_file_size) { + throw std::runtime_error("expected file size of " + file_ + " is " + + std::to_string(expected_file_size) + ", however, actual size is " + + std::to_string(file_size_)); + } + + if (subset_first_row_ >= nrows_) { + throw std::runtime_error(file_ + ": subset_first_row (" + std::to_string(subset_first_row_) + + ") >= nrows (" + std::to_string(nrows_) + ")"); + } + if (subset_first_row_ + subset_size_ > nrows_) { + throw std::runtime_error(file_ + ": subset_first_row (" + std::to_string(subset_first_row_) + + ") + subset_size (" + std::to_string(subset_size_) + ") > nrows (" + + std::to_string(nrows_) + ")"); + } + + if (subset_first_row_) { + static_assert(sizeof(long) == 8, "fseek() don't support 64-bit offset"); + if (fseek(fp_, sizeof(T) * subset_first_row_ * ndims_, SEEK_CUR) == -1) { + throw std::runtime_error(file_ + ": fseek failed"); + } + nrows_ -= subset_first_row_; + } + if (subset_size_) { nrows_ = subset_size_; } + } +} + +template +void BinFile::check_suffix_() +{ + auto pos = file_.rfind('.'); + if (pos == std::string::npos) { + throw std::runtime_error("name of BinFile doesn't have a suffix: " + file_); + } + std::string suffix = file_.substr(pos + 1); + + if constexpr (std::is_same_v) { + if (suffix != "fbin") { + throw std::runtime_error("BinFile should has .fbin suffix: " + file_); + } + } else if constexpr (std::is_same_v) { + if (suffix != "f16bin") { + 
throw std::runtime_error("BinFile should has .f16bin suffix: " + file_); + } + } else if constexpr (std::is_same_v) { + if (suffix != "ibin") { + throw std::runtime_error("BinFile should has .ibin suffix: " + file_); + } + } else if constexpr (std::is_same_v) { + if (suffix != "u8bin") { + throw std::runtime_error("BinFile should has .u8bin suffix: " + file_); + } + } else if constexpr (std::is_same_v) { + if (suffix != "i8bin") { + throw std::runtime_error("BinFile should has .i8bin suffix: " + file_); + } + } else { + throw std::runtime_error( + "T of BinFile should be one of float, half, int, uint8_t, or int8_t"); + } +} + +template +class Dataset { + public: + Dataset(const std::string& name) : name_(name) {} + Dataset(const std::string& name, const std::string& distance) : name_(name), distance_(distance) + { + } + Dataset(const Dataset&) = delete; + Dataset& operator=(const Dataset&) = delete; + virtual ~Dataset(); + + std::string name() const { return name_; } + std::string distance() const { return distance_; } + virtual int dim() const = 0; + virtual uint32_t max_k() const = 0; + virtual size_t base_set_size() const = 0; + virtual size_t query_set_size() const = 0; + + // load data lazily, so don't pay the overhead of reading unneeded set + // e.g. 
don't load base set when searching + const T* base_set() const + { + if (!base_set_) { load_base_set_(); } + return base_set_; + } + + const T* query_set() const + { + if (!query_set_) { load_query_set_(); } + return query_set_; + } + + const int32_t* gt_set() const + { + if (!gt_set_) { load_gt_set_(); } + return gt_set_; + } + + const T* base_set_on_gpu() const; + const T* query_set_on_gpu() const; + const T* mapped_base_set() const; + + auto query_set(MemoryType memory_type) const -> const T* + { + switch (memory_type) { + case MemoryType::Device: return query_set_on_gpu(); + default: return query_set(); + } + } + + auto base_set(MemoryType memory_type) const -> const T* + { + switch (memory_type) { + case MemoryType::Device: return base_set_on_gpu(); + case MemoryType::Host: return base_set(); + case MemoryType::HostMmap: return mapped_base_set(); + default: return nullptr; + } + } + + protected: + virtual void load_base_set_() const = 0; + virtual void load_gt_set_() const = 0; + virtual void load_query_set_() const = 0; + virtual void map_base_set_() const = 0; + + std::string name_; + std::string distance_; + + mutable T* base_set_ = nullptr; + mutable T* query_set_ = nullptr; + mutable T* d_base_set_ = nullptr; + mutable T* d_query_set_ = nullptr; + mutable T* mapped_base_set_ = nullptr; + mutable int32_t* gt_set_ = nullptr; +}; + +template +Dataset::~Dataset() +{ + delete[] base_set_; + delete[] query_set_; + delete[] gt_set_; +#ifndef BUILD_CPU_ONLY + if (d_base_set_) { cudaFree(d_base_set_); } + if (d_query_set_) { cudaFree(d_query_set_); } +#endif +} + +template +const T* Dataset::base_set_on_gpu() const +{ +#ifndef BUILD_CPU_ONLY + if (!d_base_set_) { + base_set(); + cudaMalloc((void**)&d_base_set_, base_set_size() * dim() * sizeof(T)); + cudaMemcpy(d_base_set_, base_set_, base_set_size() * dim() * sizeof(T), cudaMemcpyHostToDevice); + } +#endif + return d_base_set_; +} + +template +const T* Dataset::query_set_on_gpu() const +{ +#ifndef BUILD_CPU_ONLY 
+ if (!d_query_set_) { + query_set(); + cudaMalloc((void**)&d_query_set_, query_set_size() * dim() * sizeof(T)); + cudaMemcpy( + d_query_set_, query_set_, query_set_size() * dim() * sizeof(T), cudaMemcpyHostToDevice); + } +#endif + return d_query_set_; +} + +template +const T* Dataset::mapped_base_set() const +{ + if (!mapped_base_set_) { map_base_set_(); } + return mapped_base_set_; +} + +template +class BinDataset : public Dataset { + public: + BinDataset(const std::string& name, + const std::string& base_file, + size_t subset_first_row, + size_t subset_size, + const std::string& query_file, + const std::string& distance, + const std::optional& groundtruth_neighbors_file); + + int dim() const override; + uint32_t max_k() const override; + size_t base_set_size() const override; + size_t query_set_size() const override; + + private: + void load_base_set_() const override; + void load_query_set_() const override; + void load_gt_set_() const override; + void map_base_set_() const override; + + mutable int dim_ = 0; + mutable uint32_t max_k_ = 0; + mutable size_t base_set_size_ = 0; + mutable size_t query_set_size_ = 0; + + BinFile base_file_; + BinFile query_file_; + std::optional> gt_file_{std::nullopt}; +}; + +template +BinDataset::BinDataset(const std::string& name, + const std::string& base_file, + size_t subset_first_row, + size_t subset_size, + const std::string& query_file, + const std::string& distance, + const std::optional& groundtruth_neighbors_file) + : Dataset(name, distance), + base_file_(base_file, "r", subset_first_row, subset_size), + query_file_(query_file, "r") +{ + if (groundtruth_neighbors_file.has_value()) { + gt_file_.emplace(groundtruth_neighbors_file.value(), "r"); + } +} + +template +int BinDataset::dim() const +{ + if (dim_ > 0) { return dim_; } + if (base_set_size() > 0) { return dim_; } + if (query_set_size() > 0) { return dim_; } + return dim_; +} + +template +uint32_t BinDataset::max_k() const +{ + if (!this->gt_set_) { load_gt_set_(); 
} + return max_k_; +} + +template +size_t BinDataset::query_set_size() const +{ + if (query_set_size_ > 0) { return query_set_size_; } + int dim; + query_file_.get_shape(&query_set_size_, &dim); + if (query_set_size_ == 0) { throw std::runtime_error("Zero query set size"); } + if (dim == 0) { throw std::runtime_error("Zero query set dim"); } + if (dim_ == 0) { + dim_ = dim; + } else if (dim_ != dim) { + throw std::runtime_error("base set dim (" + std::to_string(dim_) + ") != query set dim (" + + std::to_string(dim)); + } + return query_set_size_; +} + +template +size_t BinDataset::base_set_size() const +{ + if (base_set_size_ > 0) { return base_set_size_; } + int dim; + base_file_.get_shape(&base_set_size_, &dim); + if (base_set_size_ == 0) { throw std::runtime_error("Zero base set size"); } + if (dim == 0) { throw std::runtime_error("Zero base set dim"); } + if (dim_ == 0) { + dim_ = dim; + } else if (dim_ != dim) { + throw std::runtime_error("base set dim (" + std::to_string(dim) + ") != query set dim (" + + std::to_string(dim_)); + } + return base_set_size_; +} + +template +void BinDataset::load_base_set_() const +{ + this->base_set_ = new T[base_set_size() * dim()]; + base_file_.read(this->base_set_); +} + +template +void BinDataset::load_query_set_() const +{ + this->query_set_ = new T[query_set_size() * dim()]; + query_file_.read(this->query_set_); +} + +template +void BinDataset::load_gt_set_() const +{ + if (gt_file_.has_value()) { + size_t queries; + int k; + gt_file_->get_shape(&queries, &k); + this->gt_set_ = new std::int32_t[queries * k]; + gt_file_->read(this->gt_set_); + max_k_ = k; + } +} + +template +void BinDataset::map_base_set_() const +{ + this->mapped_base_set_ = base_file_.map(); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/thread_pool.hpp b/cpp/bench/ann/src/common/thread_pool.hpp new file mode 100644 index 000000000..c01fa2c32 --- /dev/null +++ b/cpp/bench/ann/src/common/thread_pool.hpp @@ -0,0 +1,133 @@ +/* + 
* Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +class FixedThreadPool { + public: + FixedThreadPool(int num_threads) + { + if (num_threads < 1) { + throw std::runtime_error("num_threads must >= 1"); + } else if (num_threads == 1) { + return; + } + + tasks_ = new Task_[num_threads]; + + threads_.reserve(num_threads); + for (int i = 0; i < num_threads; ++i) { + threads_.emplace_back([&, i] { + auto& task = tasks_[i]; + while (true) { + std::unique_lock lock(task.mtx); + task.cv.wait(lock, + [&] { return task.has_task || finished_.load(std::memory_order_relaxed); }); + if (finished_.load(std::memory_order_relaxed)) { break; } + + task.task(); + task.has_task = false; + } + }); + } + } + + ~FixedThreadPool() + { + if (threads_.empty()) { return; } + + finished_.store(true, std::memory_order_relaxed); + for (unsigned i = 0; i < threads_.size(); ++i) { + auto& task = tasks_[i]; + std::lock_guard(task.mtx); + + task.cv.notify_one(); + threads_[i].join(); + } + + delete[] tasks_; + } + + template + void submit(Func f, IdxT len) + { + // Run functions in main thread if thread pool has no threads + if (threads_.empty()) { + for (IdxT i = 0; i < len; ++i) { + f(i); + } + return; + } + + const int num_threads = threads_.size(); + // one extra part for competition among threads + const IdxT items_per_thread = len / 
(num_threads + 1); + std::atomic cnt(items_per_thread * num_threads); + + // Wrap function + auto wrapped_f = [&](IdxT start, IdxT end) { + for (IdxT i = start; i < end; ++i) { + f(i); + } + + while (true) { + IdxT i = cnt.fetch_add(1, std::memory_order_relaxed); + if (i >= len) { break; } + f(i); + } + }; + + std::vector> futures; + futures.reserve(num_threads); + for (int i = 0; i < num_threads; ++i) { + IdxT start = i * items_per_thread; + auto& task = tasks_[i]; + { + std::lock_guard lock(task.mtx); + (void)lock; // stop nvcc warning + task.task = std::packaged_task([=] { wrapped_f(start, start + items_per_thread); }); + futures.push_back(task.task.get_future()); + task.has_task = true; + } + task.cv.notify_one(); + } + + for (auto& fut : futures) { + fut.wait(); + } + return; + } + + private: + struct alignas(64) Task_ { + std::mutex mtx; + std::condition_variable cv; + bool has_task = false; + std::packaged_task task; + }; + + Task_* tasks_; + std::vector threads_; + std::atomic finished_{false}; +}; diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp new file mode 100644 index 000000000..e9e4a9ad2 --- /dev/null +++ b/cpp/bench/ann/src/common/util.hpp @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "ann_types.hpp" +#include "cuda_stub.hpp" // cuda-related utils + +#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND +#include +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace raft::bench::ann { + +template +struct buf { + MemoryType memory_type; + std::size_t size; + T* data; + buf(MemoryType memory_type, std::size_t size) + : memory_type(memory_type), size(size), data(nullptr) + { + switch (memory_type) { +#ifndef BUILD_CPU_ONLY + case MemoryType::Device: { + cudaMalloc(reinterpret_cast(&data), size * sizeof(T)); + cudaMemset(data, 0, size * sizeof(T)); + } break; +#endif + default: { + data = reinterpret_cast(malloc(size * sizeof(T))); + std::memset(data, 0, size * sizeof(T)); + } + } + } + ~buf() noexcept + { + if (data == nullptr) { return; } + switch (memory_type) { +#ifndef BUILD_CPU_ONLY + case MemoryType::Device: { + cudaFree(data); + } break; +#endif + default: { + free(data); + } + } + } + + [[nodiscard]] auto move(MemoryType target_memory_type) -> buf + { + buf r{target_memory_type, size}; +#ifndef BUILD_CPU_ONLY + if ((memory_type == MemoryType::Device && target_memory_type != MemoryType::Device) || + (memory_type != MemoryType::Device && target_memory_type == MemoryType::Device)) { + cudaMemcpy(r.data, data, size * sizeof(T), cudaMemcpyDefault); + return r; + } +#endif + std::swap(data, r.data); + return r; + } +}; + +struct cuda_timer { + private: + cudaStream_t stream_{nullptr}; + cudaEvent_t start_{nullptr}; + cudaEvent_t stop_{nullptr}; + double total_time_{0}; + + public: + struct cuda_lap { + private: + cudaStream_t stream_; + cudaEvent_t start_; + cudaEvent_t stop_; + double& total_time_; + + public: + cuda_lap(cudaStream_t stream, cudaEvent_t start, cudaEvent_t stop, double& total_time) + : start_(start), stop_(stop), stream_(stream), total_time_(total_time) + { +#ifndef BUILD_CPU_ONLY + cudaStreamSynchronize(stream_); + 
cudaEventRecord(start_, stream_); +#endif + } + cuda_lap() = delete; + + ~cuda_lap() noexcept + { +#ifndef BUILD_CPU_ONLY + cudaEventRecord(stop_, stream_); + cudaEventSynchronize(stop_); + float milliseconds = 0.0f; + cudaEventElapsedTime(&milliseconds, start_, stop_); + total_time_ += milliseconds / 1000.0; +#endif + } + }; + + cuda_timer() + { +#ifndef BUILD_CPU_ONLY + cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking); + cudaEventCreate(&stop_); + cudaEventCreate(&start_); +#endif + } + + ~cuda_timer() noexcept + { +#ifndef BUILD_CPU_ONLY + cudaEventDestroy(start_); + cudaEventDestroy(stop_); + cudaStreamDestroy(stream_); +#endif + } + + [[nodiscard]] auto stream() const -> cudaStream_t { return stream_; } + + [[nodiscard]] auto total_time() const -> double { return total_time_; } + + [[nodiscard]] auto lap() -> cuda_timer::cuda_lap + { + return cuda_lap{stream_, start_, stop_, total_time_}; + } +}; + +inline auto cuda_info() +{ + std::vector> props; +#ifndef BUILD_CPU_ONLY + int dev, driver = 0, runtime = 0; + cudaDriverGetVersion(&driver); + cudaRuntimeGetVersion(&runtime); + + cudaDeviceProp device_prop; + cudaGetDevice(&dev); + cudaGetDeviceProperties(&device_prop, dev); + props.emplace_back("gpu_name", std::string(device_prop.name)); + props.emplace_back("gpu_sm_count", std::to_string(device_prop.multiProcessorCount)); + props.emplace_back("gpu_sm_freq", std::to_string(device_prop.clockRate * 1e3)); + props.emplace_back("gpu_mem_freq", std::to_string(device_prop.memoryClockRate * 1e3)); + props.emplace_back("gpu_mem_bus_width", std::to_string(device_prop.memoryBusWidth)); + props.emplace_back("gpu_mem_global_size", std::to_string(device_prop.totalGlobalMem)); + props.emplace_back("gpu_mem_shared_size", std::to_string(device_prop.sharedMemPerMultiprocessor)); + props.emplace_back("gpu_driver_version", + std::to_string(driver / 1000) + "." 
+ std::to_string((driver % 100) / 10)); + props.emplace_back("gpu_runtime_version", + std::to_string(runtime / 1000) + "." + std::to_string((runtime % 100) / 10)); +#endif + return props; +} + +struct nvtx_case { +#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND + private: + std::string case_name_; + std::array iter_name_{0}; + nvtxDomainHandle_t domain_; + int64_t iteration_ = 0; + nvtxEventAttributes_t case_attrib_{0}; + nvtxEventAttributes_t iter_attrib_{0}; +#endif + + public: + struct nvtx_lap { +#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND + private: + nvtxDomainHandle_t domain_; + + public: + nvtx_lap(nvtxDomainHandle_t domain, nvtxEventAttributes_t* attr) : domain_(domain) + { + nvtxDomainRangePushEx(domain_, attr); + } + nvtx_lap() = delete; + ~nvtx_lap() noexcept { nvtxDomainRangePop(domain_); } +#endif + }; + +#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND + explicit nvtx_case(std::string case_name) + : case_name_(std::move(case_name)), domain_(nvtxDomainCreateA("ANN benchmark")) + { + case_attrib_.version = NVTX_VERSION; + iter_attrib_.version = NVTX_VERSION; + case_attrib_.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + iter_attrib_.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + case_attrib_.colorType = NVTX_COLOR_ARGB; + iter_attrib_.colorType = NVTX_COLOR_ARGB; + case_attrib_.messageType = NVTX_MESSAGE_TYPE_ASCII; + iter_attrib_.messageType = NVTX_MESSAGE_TYPE_ASCII; + case_attrib_.message.ascii = case_name_.c_str(); + auto c = std::hash{}(case_name_); + case_attrib_.color = c | 0xA0A0A0; + nvtxDomainRangePushEx(domain_, &case_attrib_); + } + + ~nvtx_case() + { + nvtxDomainRangePop(domain_); + nvtxDomainDestroy(domain_); + } +#else + explicit nvtx_case(std::string) {} +#endif + + [[nodiscard]] auto lap() -> nvtx_case::nvtx_lap + { +#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND + auto i = iteration_++; + uint32_t c = (i % 5); + uint32_t r = 150 + c * 20; + uint32_t g = 200 + c * 10; + uint32_t b = 220 + c * 5; + std::snprintf(iter_name_.data(), iter_name_.size(), "Lap %zd", i); + 
iter_attrib_.message.ascii = iter_name_.data(); + iter_attrib_.color = (r << 16) + (g << 8) + b; + return nvtx_lap{domain_, &iter_attrib_}; +#else + return nvtx_lap{}; +#endif + } +}; + +inline std::vector split(const std::string& s, char delimiter) +{ + std::vector tokens; + std::string token; + std::istringstream iss(s); + while (getline(iss, token, delimiter)) { + if (!token.empty()) { tokens.push_back(token); } + } + return tokens; +} + +inline bool file_exists(const std::string& filename) +{ + struct stat statbuf; + if (stat(filename.c_str(), &statbuf) != 0) { return false; } + return S_ISREG(statbuf.st_mode); +} + +inline bool dir_exists(const std::string& dir) +{ + struct stat statbuf; + if (stat(dir.c_str(), &statbuf) != 0) { return false; } + return S_ISDIR(statbuf.st_mode); +} + +inline bool create_dir(const std::string& dir) +{ + const auto path = split(dir, '/'); + + std::string cwd; + if (!dir.empty() && dir[0] == '/') { cwd += '/'; } + + for (const auto& p : path) { + cwd += p + "/"; + if (!dir_exists(cwd)) { + int ret = mkdir(cwd.c_str(), S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + if (ret != 0) { return false; } + } + } + return true; +} + +inline void make_sure_parent_dir_exists(const std::string& file_path) +{ + const auto pos = file_path.rfind('/'); + if (pos != std::string::npos) { + auto dir = file_path.substr(0, pos); + if (!dir_exists(dir)) { create_dir(dir); } + } +} + +inline auto combine_path(const std::string& dir, const std::string& path) +{ + std::filesystem::path p_dir(dir); + std::filesystem::path p_suf(path); + return (p_dir / p_suf).string(); +} + +template +void log_(const char* level, const Ts&... 
vs) +{ + char buf[20]; + std::time_t now = std::time(nullptr); + std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now)); + printf("%s [%s] ", buf, level); + if constexpr (sizeof...(Ts) == 1) { + printf("%s", vs...); + } else { + printf(vs...); + } + printf("\n"); + fflush(stdout); +} + +template +void log_info(Ts&&... vs) +{ + log_("info", std::forward(vs)...); +} + +template +void log_warn(Ts&&... vs) +{ + log_("warn", std::forward(vs)...); +} + +template +void log_error(Ts&&... vs) +{ + log_("error", std::forward(vs)...); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp b/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp new file mode 100644 index 000000000..97d1bbf30 --- /dev/null +++ b/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include "faiss_cpu_wrapper.h" +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_base_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissCpu::BuildParam& param) +{ + param.nlist = conf.at("nlist"); + if (conf.contains("ratio")) { param.ratio = conf.at("ratio"); } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissCpuIVFFlat::BuildParam& param) +{ + parse_base_build_param(conf, param); +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissCpuIVFPQ::BuildParam& param) +{ + parse_base_build_param(conf, param); + param.M = conf.at("M"); + if (conf.contains("usePrecomputed")) { + param.usePrecomputed = conf.at("usePrecomputed"); + } else { + param.usePrecomputed = false; + } + if (conf.contains("bitsPerCode")) { + param.bitsPerCode = conf.at("bitsPerCode"); + } else { + param.bitsPerCode = 8; + } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissCpuIVFSQ::BuildParam& param) +{ + parse_base_build_param(conf, param); + param.quantizer_type = conf.at("quantizer_type"); +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissCpu::SearchParam& param) +{ + param.nprobe = conf.at("nprobe"); + if (conf.contains("refine_ratio")) { param.refine_ratio = conf.at("refine_ratio"); } + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const 
nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { + raft::bench::ann::Metric metric = parse_metric(distance); + if (algo == "faiss_cpu_ivf_flat") { + ann = make_algo(metric, dim, conf, dev_list); + } else if (algo == "faiss_cpu_ivf_pq") { + ann = make_algo(metric, dim, conf); + } else if (algo == "faiss_cpu_ivf_sq") { + ann = make_algo(metric, dim, conf); + } else if (algo == "faiss_cpu_flat") { + ann = std::make_unique>(metric, dim); + } + } + + if constexpr (std::is_same_v) {} + + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "faiss_cpu_ivf_flat" || algo == "faiss_cpu_ivf_pq" || algo == "faiss_cpu_ivf_sq") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } else if (algo == "faiss_cpu_flat") { + auto param = std::make_unique::AnnSearchParam>(); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace raft::bench::ann + +REGISTER_ALGO_INSTANCE(float); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } +#endif diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h new file mode 
100644 index 000000000..755fe9f19 --- /dev/null +++ b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "../common/ann_types.hpp" +#include "../common/thread_pool.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace { + +faiss::MetricType parse_metric_type(raft::bench::ann::Metric metric) +{ + if (metric == raft::bench::ann::Metric::kInnerProduct) { + return faiss::METRIC_INNER_PRODUCT; + } else if (metric == raft::bench::ann::Metric::kEuclidean) { + return faiss::METRIC_L2; + } else { + throw std::runtime_error("faiss supports only metric type of inner product and L2"); + } +} +} // namespace + +namespace raft::bench::ann { + +template +class FaissCpu : public ANN { + public: + using typename ANN::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + int nprobe; + float refine_ratio = 1.0; + int num_threads = omp_get_num_procs(); + }; + + struct BuildParam { + int nlist = 1; + int ratio = 2; + }; + + FaissCpu(Metric metric, int dim, const BuildParam& param) + : ANN(metric, dim), + metric_type_(parse_metric_type(metric)), + nlist_{param.nlist}, + training_sample_fraction_{1.0 / double(param.ratio)} + { + static_assert(std::is_same_v, "faiss support only float type"); + } + + virtual ~FaissCpu() noexcept {} + + void 
build(const T* dataset, size_t nrow, cudaStream_t stream = 0) final; + + void set_search_param(const AnnSearchParam& param) override; + + void init_quantizer(int dim) + { + if (this->metric_type_ == faiss::MetricType::METRIC_L2) { + this->quantizer_ = std::make_unique(dim); + } else if (this->metric_type_ == faiss::MetricType::METRIC_INNER_PRODUCT) { + this->quantizer_ = std::make_unique(dim); + } + } + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const final; + + AlgoProperty get_preference() const override + { + AlgoProperty property; + // to enable building big dataset which is larger than memory + property.dataset_memory_type = MemoryType::Host; + property.query_memory_type = MemoryType::Host; + return property; + } + + protected: + template + void save_(const std::string& file) const; + + template + void load_(const std::string& file); + + std::unique_ptr index_; + std::unique_ptr quantizer_; + std::unique_ptr index_refine_; + faiss::MetricType metric_type_; + int nlist_; + double training_sample_fraction_; + + int num_threads_; + std::unique_ptr thread_pool_; +}; + +template +void FaissCpu::build(const T* dataset, size_t nrow, cudaStream_t stream) +{ + auto index_ivf = dynamic_cast(index_.get()); + if (index_ivf != nullptr) { + // set the min/max training size for clustering to use the whole provided training set. + double trainset_size = training_sample_fraction_ * static_cast(nrow); + double points_per_centroid = trainset_size / static_cast(nlist_); + int max_ppc = std::ceil(points_per_centroid); + int min_ppc = std::floor(points_per_centroid); + if (min_ppc < index_ivf->cp.min_points_per_centroid) { + RAFT_LOG_WARN( + "The suggested training set size %zu (data size %zu, training sample ratio %f) yields %d " + "points per cluster (n_lists = %d). 
This is smaller than the FAISS default " + "min_points_per_centroid = %d.", + static_cast(trainset_size), + nrow, + training_sample_fraction_, + min_ppc, + nlist_, + index_ivf->cp.min_points_per_centroid); + } + index_ivf->cp.max_points_per_centroid = max_ppc; + index_ivf->cp.min_points_per_centroid = min_ppc; + } + index_->train(nrow, dataset); // faiss::IndexFlat::train() will do nothing + assert(index_->is_trained); + index_->add(nrow, dataset); + index_refine_ = std::make_unique(this->index_.get(), dataset); +} + +template +void FaissCpu::set_search_param(const AnnSearchParam& param) +{ + auto search_param = dynamic_cast(param); + int nprobe = search_param.nprobe; + assert(nprobe <= nlist_); + dynamic_cast(index_.get())->nprobe = nprobe; + + if (search_param.refine_ratio > 1.0) { + this->index_refine_.get()->k_factor = search_param.refine_ratio; + } + + if (!thread_pool_ || num_threads_ != search_param.num_threads) { + num_threads_ = search_param.num_threads; + thread_pool_ = std::make_unique(num_threads_); + } +} + +template +void FaissCpu::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + static_assert(sizeof(size_t) == sizeof(faiss::idx_t), + "sizes of size_t and faiss::idx_t are different"); + + thread_pool_->submit( + [&](int i) { + // Use thread pool for batch size = 1. FAISS multi-threads internally for batch size > 1. 
+ index_->search(batch_size, queries, k, distances, reinterpret_cast(neighbors)); + }, + 1); +} + +template +template +void FaissCpu::save_(const std::string& file) const +{ + faiss::write_index(index_.get(), file.c_str()); +} + +template +template +void FaissCpu::load_(const std::string& file) +{ + index_ = std::unique_ptr(dynamic_cast(faiss::read_index(file.c_str()))); +} + +template +class FaissCpuIVFFlat : public FaissCpu { + public: + using typename FaissCpu::BuildParam; + + FaissCpuIVFFlat(Metric metric, int dim, const BuildParam& param) : FaissCpu(metric, dim, param) + { + this->init_quantizer(dim); + this->index_ = std::make_unique( + this->quantizer_.get(), dim, param.nlist, this->metric_type_); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override { this->template load_(file); } +}; + +template +class FaissCpuIVFPQ : public FaissCpu { + public: + struct BuildParam : public FaissCpu::BuildParam { + int M; + int bitsPerCode; + bool usePrecomputed; + }; + + FaissCpuIVFPQ(Metric metric, int dim, const BuildParam& param) : FaissCpu(metric, dim, param) + { + this->init_quantizer(dim); + this->index_ = std::make_unique( + this->quantizer_.get(), dim, param.nlist, param.M, param.bitsPerCode, this->metric_type_); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override { this->template load_(file); } +}; + +// TODO: Enable this in cmake +// ref: https://github.com/rapidsai/raft/issues/1876 +template +class FaissCpuIVFSQ : public FaissCpu { + public: + struct BuildParam : public FaissCpu::BuildParam { + std::string quantizer_type; + }; + + FaissCpuIVFSQ(Metric metric, int dim, const BuildParam& param) : FaissCpu(metric, dim, param) + { + faiss::ScalarQuantizer::QuantizerType qtype; + if (param.quantizer_type == "fp16") { + qtype = faiss::ScalarQuantizer::QT_fp16; + } else if 
(param.quantizer_type == "int8") { + qtype = faiss::ScalarQuantizer::QT_8bit; + } else { + throw std::runtime_error("FaissCpuIVFSQ supports only fp16 and int8 but got " + + param.quantizer_type); + } + + this->init_quantizer(dim); + this->index_ = std::make_unique( + this->quantizer_.get(), dim, param.nlist, qtype, this->metric_type_, true); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } +}; + +template +class FaissCpuFlat : public FaissCpu { + public: + FaissCpuFlat(Metric metric, int dim) + : FaissCpu(metric, dim, typename FaissCpu::BuildParam{}) + { + this->index_ = std::make_unique(dim, this->metric_type_); + } + + // class FaissCpu is more like a IVF class, so need special treating here + void set_search_param(const typename ANN::AnnSearchParam& param) override + { + auto search_param = dynamic_cast::SearchParam&>(param); + if (!this->thread_pool_ || this->num_threads_ != search_param.num_threads) { + this->num_threads_ = search_param.num_threads; + this->thread_pool_ = std::make_unique(this->num_threads_); + } + }; + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override { this->template load_(file); } +}; + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu new file mode 100644 index 000000000..8b04ba198 --- /dev/null +++ b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#undef WARP_SIZE +#include "faiss_gpu_wrapper.h" +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_base_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpu::BuildParam& param) +{ + param.nlist = conf.at("nlist"); + if (conf.contains("ratio")) { param.ratio = conf.at("ratio"); } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpuIVFFlat::BuildParam& param) +{ + parse_base_build_param(conf, param); +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpuIVFPQ::BuildParam& param) +{ + parse_base_build_param(conf, param); + param.M = conf.at("M"); + if (conf.contains("usePrecomputed")) { + param.usePrecomputed = conf.at("usePrecomputed"); + } else { + param.usePrecomputed = false; + } + if (conf.contains("useFloat16")) { + param.useFloat16 = conf.at("useFloat16"); + } else { + param.useFloat16 = false; + } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpuIVFSQ::BuildParam& param) +{ + parse_base_build_param(conf, param); + param.quantizer_type = conf.at("quantizer_type"); +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpu::SearchParam& param) +{ + param.nprobe = conf.at("nprobe"); + if (conf.contains("refine_ratio")) { param.refine_ratio = 
conf.at("refine_ratio"); } +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { + raft::bench::ann::Metric metric = parse_metric(distance); + if (algo == "faiss_gpu_ivf_flat") { + ann = make_algo(metric, dim, conf, dev_list); + } else if (algo == "faiss_gpu_ivf_pq") { + ann = make_algo(metric, dim, conf); + } else if (algo == "faiss_gpu_ivf_sq") { + ann = make_algo(metric, dim, conf); + } else if (algo == "faiss_gpu_flat") { + ann = std::make_unique>(metric, dim); + } + } + + if constexpr (std::is_same_v) {} + + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "faiss_gpu_ivf_flat" || algo == "faiss_gpu_ivf_pq" || algo == "faiss_gpu_ivf_sq") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } else if (algo == "faiss_gpu_flat") { + auto param = std::make_unique::AnnSearchParam>(); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace raft::bench::ann + 
+REGISTER_ALGO_INSTANCE(float); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } +#endif diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h new file mode 100644 index 000000000..4f13ff8a4 --- /dev/null +++ b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h @@ -0,0 +1,435 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef FAISS_WRAPPER_H_ +#define FAISS_WRAPPER_H_ + +#include "../common/ann_types.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace { + +faiss::MetricType parse_metric_type(raft::bench::ann::Metric metric) +{ + if (metric == raft::bench::ann::Metric::kInnerProduct) { + return faiss::METRIC_INNER_PRODUCT; + } else if (metric == raft::bench::ann::Metric::kEuclidean) { + return faiss::METRIC_L2; + } else { + throw std::runtime_error("faiss supports only metric type of inner product and L2"); + } +} + +// note BLAS library can still use multi-threading, and +// setting environment variable like OPENBLAS_NUM_THREADS can control it +class OmpSingleThreadScope { + public: + OmpSingleThreadScope() + { + max_threads_ = omp_get_max_threads(); + omp_set_num_threads(1); + } + ~OmpSingleThreadScope() + { + // the best we can do + omp_set_num_threads(max_threads_); + } + + private: + int max_threads_; +}; + +} // namespace + +namespace raft::bench::ann { + +template +class FaissGpu : public ANN { + public: + using typename ANN::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + int nprobe; + float refine_ratio = 1.0; + auto needs_dataset() const -> bool override { return refine_ratio > 1.0f; } + }; + + struct BuildParam { + int nlist = 1; + int ratio = 2; + }; + + FaissGpu(Metric metric, int dim, const BuildParam& param) + : ANN(metric, dim), + metric_type_(parse_metric_type(metric)), + nlist_{param.nlist}, + training_sample_fraction_{1.0 / double(param.ratio)} + { + static_assert(std::is_same_v, "faiss support only float type"); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); + RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming)); + faiss_default_stream_ = gpu_resource_.getDefaultStream(device_); + raft::resource::set_cuda_stream(handle_, 
faiss_default_stream_); + } + + virtual ~FaissGpu() noexcept { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_)); } + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) final; + + virtual void set_search_param(const FaissGpu::AnnSearchParam& param) {} + + void set_search_dataset(const T* dataset, size_t nrow) override { dataset_ = dataset; } + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const final; + + AlgoProperty get_preference() const override + { + AlgoProperty property; + // to enable building big dataset which is larger than GPU memory + property.dataset_memory_type = MemoryType::Host; + property.query_memory_type = MemoryType::Host; + return property; + } + + protected: + template + void save_(const std::string& file) const; + + template + void load_(const std::string& file); + + void stream_wait(cudaStream_t stream) const + { + RAFT_CUDA_TRY(cudaEventRecord(sync_, faiss_default_stream_)); + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_)); + } + + mutable faiss::gpu::StandardGpuResources gpu_resource_; + std::unique_ptr index_; + std::unique_ptr index_refine_{nullptr}; + faiss::MetricType metric_type_; + int nlist_; + int device_; + cudaEvent_t sync_{nullptr}; + cudaStream_t faiss_default_stream_{nullptr}; + double training_sample_fraction_; + std::unique_ptr search_params_; + const T* dataset_; + raft::device_resources handle_; + float refine_ratio_ = 1.0; +}; + +template +void FaissGpu::build(const T* dataset, size_t nrow, cudaStream_t stream) +{ + OmpSingleThreadScope omp_single_thread; + auto index_ivf = dynamic_cast(index_.get()); + if (index_ivf != nullptr) { + // set the min/max training size for clustering to use the whole provided training set. 
+ double trainset_size = training_sample_fraction_ * static_cast(nrow); + double points_per_centroid = trainset_size / static_cast(nlist_); + int max_ppc = std::ceil(points_per_centroid); + int min_ppc = std::floor(points_per_centroid); + if (min_ppc < index_ivf->cp.min_points_per_centroid) { + RAFT_LOG_WARN( + "The suggested training set size %zu (data size %zu, training sample ratio %f) yields %d " + "points per cluster (n_lists = %d). This is smaller than the FAISS default " + "min_points_per_centroid = %d.", + static_cast(trainset_size), + nrow, + training_sample_fraction_, + min_ppc, + nlist_, + index_ivf->cp.min_points_per_centroid); + } + index_ivf->cp.max_points_per_centroid = max_ppc; + index_ivf->cp.min_points_per_centroid = min_ppc; + } + index_->train(nrow, dataset); // faiss::gpu::GpuIndexFlat::train() will do nothing + assert(index_->is_trained); + index_->add(nrow, dataset); + stream_wait(stream); +} + +template +void FaissGpu::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + static_assert(sizeof(size_t) == sizeof(faiss::idx_t), + "sizes of size_t and faiss::idx_t are different"); + + if (this->refine_ratio_ > 1.0) { + // TODO: FAISS changed their search APIs to accept the search parameters as a struct object + // but their refine API doesn't allow the struct to be passed in. 
Once this is fixed, we + // need to re-enable refinement below + // index_refine_->search(batch_size, queries, k, distances, + // reinterpret_cast(neighbors), this->search_params_.get()); Related FAISS issue: + // https://github.com/facebookresearch/faiss/issues/3118 + throw std::runtime_error( + "FAISS doesn't support refinement in their new APIs so this feature is disabled in the " + "benchmarks for the time being."); + } else { + index_->search(batch_size, + queries, + k, + distances, + reinterpret_cast(neighbors), + this->search_params_.get()); + } + stream_wait(stream); +} + +template +template +void FaissGpu::save_(const std::string& file) const +{ + OmpSingleThreadScope omp_single_thread; + + auto cpu_index = std::make_unique(); + dynamic_cast(index_.get())->copyTo(cpu_index.get()); + faiss::write_index(cpu_index.get(), file.c_str()); +} + +template +template +void FaissGpu::load_(const std::string& file) +{ + OmpSingleThreadScope omp_single_thread; + + std::unique_ptr cpu_index(dynamic_cast(faiss::read_index(file.c_str()))); + assert(cpu_index); + + try { + dynamic_cast(index_.get())->copyFrom(cpu_index.get()); + + } catch (const std::exception& e) { + std::cout << "Error loading index file: " << std::string(e.what()) << std::endl; + } +} + +template +class FaissGpuIVFFlat : public FaissGpu { + public: + using typename FaissGpu::BuildParam; + + FaissGpuIVFFlat(Metric metric, int dim, const BuildParam& param) : FaissGpu(metric, dim, param) + { + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = this->device_; + this->index_ = std::make_unique( + &(this->gpu_resource_), dim, param.nlist, this->metric_type_, config); + } + + void set_search_param(const typename FaissGpu::AnnSearchParam& param) override + { + auto search_param = dynamic_cast::SearchParam&>(param); + int nprobe = search_param.nprobe; + assert(nprobe <= nlist_); + + faiss::IVFSearchParameters faiss_search_params; + faiss_search_params.nprobe = nprobe; + this->search_params_ = 
std::make_unique(faiss_search_params); + this->refine_ratio_ = search_param.refine_ratio; + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } +}; + +template +class FaissGpuIVFPQ : public FaissGpu { + public: + struct BuildParam : public FaissGpu::BuildParam { + int M; + bool useFloat16; + bool usePrecomputed; + }; + + FaissGpuIVFPQ(Metric metric, int dim, const BuildParam& param) : FaissGpu(metric, dim, param) + { + faiss::gpu::GpuIndexIVFPQConfig config; + config.useFloat16LookupTables = param.useFloat16; + config.usePrecomputedTables = param.usePrecomputed; + config.device = this->device_; + + this->index_ = + std::make_unique(&(this->gpu_resource_), + dim, + param.nlist, + param.M, + 8, // FAISS only supports bitsPerCode=8 + this->metric_type_, + config); + } + + void set_search_param(const typename FaissGpu::AnnSearchParam& param) override + { + auto search_param = dynamic_cast::SearchParam&>(param); + int nprobe = search_param.nprobe; + assert(nprobe <= nlist_); + this->refine_ratio_ = search_param.refine_ratio; + faiss::IVFPQSearchParameters faiss_search_params; + faiss_search_params.nprobe = nprobe; + + this->search_params_ = std::make_unique(faiss_search_params); + + if (search_param.refine_ratio > 1.0) { + this->index_refine_ = + std::make_unique(this->index_.get(), this->dataset_); + this->index_refine_.get()->k_factor = search_param.refine_ratio; + } + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } +}; + +// TODO: Enable this in cmake +// ref: https://github.com/rapidsai/raft/issues/1876 +template +class FaissGpuIVFSQ : public FaissGpu { + public: + struct BuildParam : public FaissGpu::BuildParam { + std::string quantizer_type; + }; + + FaissGpuIVFSQ(Metric metric, int dim, const BuildParam& param) 
: FaissGpu(metric, dim, param) + { + faiss::ScalarQuantizer::QuantizerType qtype; + if (param.quantizer_type == "fp16") { + qtype = faiss::ScalarQuantizer::QT_fp16; + } else if (param.quantizer_type == "int8") { + qtype = faiss::ScalarQuantizer::QT_8bit; + } else { + throw std::runtime_error("FaissGpuIVFSQ supports only fp16 and int8 but got " + + param.quantizer_type); + } + + faiss::gpu::GpuIndexIVFScalarQuantizerConfig config; + config.device = this->device_; + this->index_ = std::make_unique( + &(this->gpu_resource_), dim, param.nlist, qtype, this->metric_type_, true, config); + } + + void set_search_param(const typename FaissGpu::AnnSearchParam& param) override + { + auto search_param = dynamic_cast::SearchParam&>(param); + int nprobe = search_param.nprobe; + assert(nprobe <= nlist_); + + faiss::IVFSearchParameters faiss_search_params; + faiss_search_params.nprobe = nprobe; + + this->search_params_ = std::make_unique(faiss_search_params); + this->refine_ratio_ = search_param.refine_ratio; + if (search_param.refine_ratio > 1.0) { + this->index_refine_ = + std::make_unique(this->index_.get(), this->dataset_); + this->index_refine_.get()->k_factor = search_param.refine_ratio; + } + } + + void save(const std::string& file) const override + { + this->template save_( + file); + } + void load(const std::string& file) override + { + this->template load_( + file); + } +}; + +template +class FaissGpuFlat : public FaissGpu { + public: + FaissGpuFlat(Metric metric, int dim) + : FaissGpu(metric, dim, typename FaissGpu::BuildParam{}) + { + faiss::gpu::GpuIndexFlatConfig config; + config.device = this->device_; + this->index_ = std::make_unique( + &(this->gpu_resource_), dim, this->metric_type_, config); + } + void set_search_param(const typename FaissGpu::AnnSearchParam& param) override + { + auto search_param = dynamic_cast::SearchParam&>(param); + int nprobe = search_param.nprobe; + assert(nprobe <= nlist_); + + this->search_params_ = std::make_unique(); + } + + void 
save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } +}; + +} // namespace raft::bench::ann + +#endif \ No newline at end of file diff --git a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu new file mode 100644 index 000000000..3b2e97062 --- /dev/null +++ b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include "ggnn_wrapper.cuh" +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::Ggnn::BuildParam& param) +{ + param.k = conf.at("k"); + + if (conf.contains("k_build")) { param.k_build = conf.at("k_build"); } + if (conf.contains("segment_size")) { param.segment_size = conf.at("segment_size"); } + if (conf.contains("num_layers")) { param.num_layers = conf.at("num_layers"); } + if (conf.contains("tau")) { param.tau = conf.at("tau"); } + if (conf.contains("refine_iterations")) { + param.refine_iterations = conf.at("refine_iterations"); + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::Ggnn::SearchParam& param) +{ + param.tau = conf.at("tau"); + + if (conf.contains("block_dim")) { param.block_dim = conf.at("block_dim"); } + if (conf.contains("max_iterations")) { param.max_iterations = conf.at("max_iterations"); } + if (conf.contains("cache_size")) { param.cache_size = conf.at("cache_size"); } + if (conf.contains("sorted_size")) { param.sorted_size = conf.at("sorted_size"); } +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop 
compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) {} + + if constexpr (std::is_same_v) {} + + if (algo == "ggnn") { ann = make_algo(metric, dim, conf); } + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "ggnn") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace raft::bench::ann + +REGISTER_ALGO_INSTANCE(float); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } +#endif diff --git a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh new file mode 100644 index 000000000..664ec511d --- /dev/null +++ b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "../common/ann_types.hpp" + +#include +#include + +#include +#include + +namespace raft::bench::ann { + +template +class GgnnImpl; + +template +class Ggnn : public ANN { + public: + struct BuildParam { + int k_build{24}; // KBuild + int segment_size{32}; // S + int num_layers{4}; // L + float tau{0.5}; + int refine_iterations{2}; + int k; // GGNN requires to know k during building + }; + + using typename ANN::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + float tau; + int block_dim{32}; + int max_iterations{400}; + int cache_size{512}; + int sorted_size{256}; + auto needs_dataset() const -> bool override { return true; } + }; + + Ggnn(Metric metric, int dim, const BuildParam& param); + ~Ggnn() { delete impl_; } + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) override + { + impl_->build(dataset, nrow, stream); + } + + void set_search_param(const AnnSearchParam& param) override { impl_->set_search_param(param); } + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override + { + impl_->search(queries, batch_size, k, neighbors, distances, stream); + } + + void save(const std::string& file) const override { impl_->save(file); } + void load(const std::string& file) override { impl_->load(file); } + + AlgoProperty get_preference() const override { return impl_->get_preference(); } + + void set_search_dataset(const T* dataset, size_t nrow) override + { + impl_->set_search_dataset(dataset, nrow); + }; + + private: + ANN* impl_; +}; + +template +Ggnn::Ggnn(Metric metric, int dim, const BuildParam& param) : ANN(metric, dim) +{ + // ggnn/src/sift1m.cu + if (metric == Metric::kEuclidean && dim == 128 && param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = new GgnnImpl(metric, dim, param); + } + // ggnn/src/deep1b_multi_gpu.cu, and adapt it deep1B + else if (metric == Metric::kEuclidean && dim == 96 
&& param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = new GgnnImpl(metric, dim, param); + } else if (metric == Metric::kInnerProduct && dim == 96 && param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = new GgnnImpl(metric, dim, param); + } else if (metric == Metric::kInnerProduct && dim == 96 && param.k_build == 96 && param.k == 10 && + param.segment_size == 64) { + impl_ = new GgnnImpl(metric, dim, param); + } + // ggnn/src/glove200.cu, adapt it to glove100 + else if (metric == Metric::kInnerProduct && dim == 100 && param.k_build == 96 && param.k == 10 && + param.segment_size == 64) { + impl_ = new GgnnImpl(metric, dim, param); + } else { + throw std::runtime_error( + "ggnn: not supported combination of metric, dim and build param; " + "see Ggnn's constructor in ggnn_wrapper.cuh for available combinations"); + } +} + +template +class GgnnImpl : public ANN { + public: + using typename ANN::AnnSearchParam; + + GgnnImpl(Metric metric, int dim, const typename Ggnn::BuildParam& param); + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) override; + + void set_search_param(const AnnSearchParam& param) override; + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + void save(const std::string& file) const override; + void load(const std::string& file) override; + + AlgoProperty get_preference() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Device; + property.query_memory_type = MemoryType::Device; + return property; + } + + void set_search_dataset(const T* dataset, size_t nrow) override; + + private: + using ANN::metric_; + using ANN::dim_; + + using GGNNGPUInstance = GGNNGPUInstance; + std::unique_ptr ggnn_; + typename Ggnn::BuildParam build_param_; + typename Ggnn::SearchParam search_param_; +}; + +template +GgnnImpl::GgnnImpl(Metric metric, + int dim, + const 
typename Ggnn::BuildParam& param) + : ANN(metric, dim), build_param_(param) +{ + if (metric_ == Metric::kInnerProduct) { + if (measure != Cosine) { throw std::runtime_error("mis-matched metric"); } + } else if (metric_ == Metric::kEuclidean) { + if (measure != Euclidean) { throw std::runtime_error("mis-matched metric"); } + } else { + throw std::runtime_error( + "ggnn supports only metric type of InnerProduct, Cosine and Euclidean"); + } + + if (dim != D) { throw std::runtime_error("mis-matched dim"); } +} + +template +void GgnnImpl::build(const T* dataset, + size_t nrow, + cudaStream_t stream) +{ + int device; + RAFT_CUDA_TRY(cudaGetDevice(&device)); + ggnn_ = std::make_unique( + device, nrow, build_param_.num_layers, true, build_param_.tau); + + ggnn_->set_base_data(dataset); + ggnn_->set_stream(stream); + ggnn_->build(0); + for (int i = 0; i < build_param_.refine_iterations; ++i) { + ggnn_->refine(); + } +} + +template +void GgnnImpl::set_search_dataset(const T* dataset, size_t nrow) +{ + ggnn_->set_base_data(dataset); +} + +template +void GgnnImpl::set_search_param(const AnnSearchParam& param) +{ + search_param_ = dynamic_cast::SearchParam&>(param); +} + +template +void GgnnImpl::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + static_assert(sizeof(size_t) == sizeof(int64_t), "sizes of size_t and GGNN's KeyT are different"); + if (k != KQuery) { + throw std::runtime_error( + "k = " + std::to_string(k) + + ", but this GGNN instance only supports k = " + std::to_string(KQuery)); + } + + ggnn_->set_stream(stream); + RAFT_CUDA_TRY(cudaMemcpyToSymbol(c_tau_query, &search_param_.tau, sizeof(float))); + + const int block_dim = search_param_.block_dim; + const int max_iterations = search_param_.max_iterations; + const int cache_size = search_param_.cache_size; + const int sorted_size = search_param_.sorted_size; + // default value + if (block_dim == 32 && max_iterations == 400 && cache_size 
== 512 && sorted_size == 256) { + ggnn_->template queryLayer<32, 400, 512, 256, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/sift1m.cu + else if (block_dim == 32 && max_iterations == 200 && cache_size == 256 && sorted_size == 64) { + ggnn_->template queryLayer<32, 200, 256, 64, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/sift1m.cu + else if (block_dim == 32 && max_iterations == 400 && cache_size == 448 && sorted_size == 64) { + ggnn_->template queryLayer<32, 400, 448, 64, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/glove200.cu + else if (block_dim == 128 && max_iterations == 2000 && cache_size == 2048 && sorted_size == 32) { + ggnn_->template queryLayer<128, 2000, 2048, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // for glove100 + else if (block_dim == 64 && max_iterations == 400 && cache_size == 512 && sorted_size == 32) { + ggnn_->template queryLayer<64, 400, 512, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } else if (block_dim == 128 && max_iterations == 2000 && cache_size == 1024 && + sorted_size == 32) { + ggnn_->template queryLayer<128, 2000, 1024, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } else { + throw std::runtime_error("ggnn: not supported search param"); + } +} + +template +void GgnnImpl::save(const std::string& file) const +{ + auto& ggnn_host = ggnn_->ggnn_cpu_buffers.at(0); + auto& ggnn_device = ggnn_->ggnn_shards.at(0); + ggnn_->set_stream(0); + + ggnn_host.downloadAsync(ggnn_device); + RAFT_CUDA_TRY(cudaStreamSynchronize(ggnn_device.stream)); + ggnn_host.store(file); +} + +template +void GgnnImpl::load(const std::string& file) +{ + auto& ggnn_host = ggnn_->ggnn_cpu_buffers.at(0); + auto& ggnn_device = ggnn_->ggnn_shards.at(0); + ggnn_->set_stream(0); + + ggnn_host.load(file); + 
ggnn_host.uploadAsync(ggnn_device); + RAFT_CUDA_TRY(cudaStreamSynchronize(ggnn_device.stream)); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp new file mode 100644 index 000000000..1af19a22c --- /dev/null +++ b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../common/ann_types.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include "hnswlib_wrapper.h" +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::HnswLib::BuildParam& param) +{ + param.ef_construction = conf.at("efConstruction"); + param.M = conf.at("M"); + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::HnswLib::SearchParam& param) +{ + param.ef = conf.at("ef"); + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template 
class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { + if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } + } + + if constexpr (std::is_same_v) { + if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } + } + + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "hnswlib") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +}; // namespace raft::bench::ann + +REGISTER_ALGO_INSTANCE(float); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } +#endif diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h new file mode 100644 index 000000000..921d72dec --- /dev/null +++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include "../common/thread_pool.hpp" +#include + +namespace raft::bench::ann { + +template +struct hnsw_dist_t { + using type = void; +}; + +template <> +struct hnsw_dist_t { + using type = float; +}; + +template <> +struct hnsw_dist_t { + using type = int; +}; + +template +class HnswLib : public ANN { + public: + // https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md + struct BuildParam { + int M; + int ef_construction; + int num_threads = omp_get_num_procs(); + }; + + using typename ANN::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + int ef; + int num_threads = 1; + }; + + HnswLib(Metric metric, int dim, const BuildParam& param); + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) override; + + void set_search_param(const AnnSearchParam& param) override; + void search(const T* query, + int batch_size, + int k, + size_t* indices, + float* distances, + cudaStream_t stream = 0) const override; + + void save(const std::string& path_to_index) const override; + void load(const std::string& path_to_index) override; + + AlgoProperty get_preference() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Host; + 
property.query_memory_type = MemoryType::Host; + return property; + } + + void set_base_layer_only() { appr_alg_->base_layer_only = true; } + + private: + void get_search_knn_results_(const T* query, int k, size_t* indices, float* distances) const; + + std::unique_ptr::type>> appr_alg_; + std::unique_ptr::type>> space_; + + using ANN::metric_; + using ANN::dim_; + int ef_construction_; + int m_; + int num_threads_; + std::unique_ptr thread_pool_; + Objective metric_objective_; +}; + +template +HnswLib::HnswLib(Metric metric, int dim, const BuildParam& param) : ANN(metric, dim) +{ + assert(dim_ > 0); + static_assert(std::is_same_v || std::is_same_v); + if constexpr (std::is_same_v) { + if (metric_ != Metric::kEuclidean) { + throw std::runtime_error("hnswlib only supports Euclidean distance"); + } + } + + ef_construction_ = param.ef_construction; + m_ = param.M; + num_threads_ = param.num_threads; +} + +template +void HnswLib::build(const T* dataset, size_t nrow, cudaStream_t) +{ + if constexpr (std::is_same_v) { + if (metric_ == Metric::kInnerProduct) { + space_ = std::make_unique(dim_); + } else { + space_ = std::make_unique(dim_); + } + } else if constexpr (std::is_same_v) { + space_ = std::make_unique(dim_); + } + + appr_alg_ = std::make_unique::type>>( + space_.get(), nrow, m_, ef_construction_); + + thread_pool_ = std::make_unique(num_threads_); + const size_t items_per_thread = nrow / (num_threads_ + 1); + + thread_pool_->submit( + [&](size_t i) { + if (i < items_per_thread && i % 10000 == 0) { + char buf[20]; + std::time_t now = std::time(nullptr); + std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now)); + printf("%s building %zu / %zu\n", buf, i, items_per_thread); + fflush(stdout); + } + + appr_alg_->addPoint(dataset + i * dim_, i); + }, + nrow); +} + +template +void HnswLib::set_search_param(const AnnSearchParam& param_) +{ + auto param = dynamic_cast(param_); + appr_alg_->ef_ = param.ef; + metric_objective_ = param.metric_objective; + 
num_threads_ = param.num_threads; + + // Create a pool if multiple query threads have been set and the pool hasn't been created already + bool create_pool = (metric_objective_ == Objective::LATENCY && num_threads_ > 1 && !thread_pool_); + if (create_pool) { thread_pool_ = std::make_unique(num_threads_); } +} + +template +void HnswLib::search( + const T* query, int batch_size, int k, size_t* indices, float* distances, cudaStream_t) const +{ + auto f = [&](int i) { + // hnsw can only handle a single vector at a time. + get_search_knn_results_(query + i * dim_, k, indices + i * k, distances + i * k); + }; + if (metric_objective_ == Objective::LATENCY && num_threads_ > 1) { + thread_pool_->submit(f, batch_size); + } else { + for (int i = 0; i < batch_size; i++) { + f(i); + } + } +} + +template +void HnswLib::save(const std::string& path_to_index) const +{ + appr_alg_->saveIndex(std::string(path_to_index)); +} + +template +void HnswLib::load(const std::string& path_to_index) +{ + if constexpr (std::is_same_v) { + if (metric_ == Metric::kInnerProduct) { + space_ = std::make_unique(dim_); + } else { + space_ = std::make_unique(dim_); + } + } else if constexpr (std::is_same_v) { + space_ = std::make_unique(dim_); + } + + appr_alg_ = std::make_unique::type>>( + space_.get(), path_to_index); +} + +template +void HnswLib::get_search_knn_results_(const T* query, + int k, + size_t* indices, + float* distances) const +{ + auto result = appr_alg_->searchKnn(query, k); + assert(result.size() >= static_cast(k)); + + for (int i = k - 1; i >= 0; --i) { + indices[i] = result.top().second; + distances[i] = result.top().first; + result.pop(); + } +} + +}; // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h new file mode 100644 index 000000000..1eb0e53cc --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#define JSON_DIAGNOSTICS 1 +#include + +#undef WARP_SIZE +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN +#include "raft_wrapper.h" +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT +#include "raft_ivf_flat_wrapper.h" +extern template class raft::bench::ann::RaftIvfFlatGpu; +extern template class raft::bench::ann::RaftIvfFlatGpu; +extern template class raft::bench::ann::RaftIvfFlatGpu; +#endif +#if defined(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ) || defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA) || \ + defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB) +#include "raft_ivf_pq_wrapper.h" +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ +extern template class raft::bench::ann::RaftIvfPQ; +extern template class raft::bench::ann::RaftIvfPQ; +extern template class raft::bench::ann::RaftIvfPQ; +#endif +#if defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA) || defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB) +#include "raft_cagra_wrapper.h" +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_CAGRA +extern template class raft::bench::ann::RaftCagra; +extern template class raft::bench::ann::RaftCagra; +extern template class raft::bench::ann::RaftCagra; +#endif + +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfFlatGpu::BuildParam& param) +{ + param.n_lists = conf.at("nlist"); + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if 
(conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfFlatGpu::SearchParam& param) +{ + param.ivf_flat_params.n_probes = conf.at("nprobe"); +} +#endif + +#if defined(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ) || defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA) || \ + defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB) +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfPQ::BuildParam& param) +{ + if (conf.contains("nlist")) { param.n_lists = conf.at("nlist"); } + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } + if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); } + if (conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); } + if (conf.contains("codebook_kind")) { + std::string kind = conf.at("codebook_kind"); + if (kind == "cluster") { + param.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_CLUSTER; + } else if (kind == "subspace") { + param.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE; + } else { + throw std::runtime_error("codebook_kind: '" + kind + + "', should be either 'cluster' or 'subspace'"); + } + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfPQ::SearchParam& param) +{ + if (conf.contains("nprobe")) { param.pq_param.n_probes = conf.at("nprobe"); } + if (conf.contains("internalDistanceDtype")) { + std::string type = conf.at("internalDistanceDtype"); + if (type == "float") { + param.pq_param.internal_distance_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } else { + throw std::runtime_error("internalDistanceDtype: '" + type + + "', should be either 'float' or 'half'"); + } + } else { + // set half 
as default type + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } + + if (conf.contains("smemLutDtype")) { + std::string type = conf.at("smemLutDtype"); + if (type == "float") { + param.pq_param.lut_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.lut_dtype = CUDA_R_16F; + } else if (type == "fp8") { + param.pq_param.lut_dtype = CUDA_R_8U; + } else { + throw std::runtime_error("smemLutDtype: '" + type + + "', should be either 'float', 'half' or 'fp8'"); + } + } else { + // set half as default + param.pq_param.lut_dtype = CUDA_R_16F; + } + if (conf.contains("refine_ratio")) { + param.refine_ratio = conf.at("refine_ratio"); + if (param.refine_ratio < 1.0f) { throw std::runtime_error("refine_ratio should be >= 1.0"); } + } +} +#endif + +#if defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA) || defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB) +template +void parse_build_param(const nlohmann::json& conf, + raft::neighbors::experimental::nn_descent::index_params& param) +{ + if (conf.contains("graph_degree")) { param.graph_degree = conf.at("graph_degree"); } + if (conf.contains("intermediate_graph_degree")) { + param.intermediate_graph_degree = conf.at("intermediate_graph_degree"); + } + // we allow niter shorthand for max_iterations + if (conf.contains("niter")) { param.max_iterations = conf.at("niter"); } + if (conf.contains("max_iterations")) { param.max_iterations = conf.at("max_iterations"); } + if (conf.contains("termination_threshold")) { + param.termination_threshold = conf.at("termination_threshold"); + } +} + +nlohmann::json collect_conf_with_prefix(const nlohmann::json& conf, + const std::string& prefix, + bool remove_prefix = true) +{ + nlohmann::json out; + for (auto& i : conf.items()) { + if (i.key().compare(0, prefix.size(), prefix) == 0) { + auto new_key = remove_prefix ? 
i.key().substr(prefix.size()) : i.key(); + out[new_key] = i.value(); + } + } + return out; +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftCagra::BuildParam& param) +{ + if (conf.contains("graph_degree")) { + param.cagra_params.graph_degree = conf.at("graph_degree"); + param.cagra_params.intermediate_graph_degree = param.cagra_params.graph_degree * 2; + } + if (conf.contains("intermediate_graph_degree")) { + param.cagra_params.intermediate_graph_degree = conf.at("intermediate_graph_degree"); + } + if (conf.contains("graph_build_algo")) { + if (conf.at("graph_build_algo") == "IVF_PQ") { + param.cagra_params.build_algo = raft::neighbors::cagra::graph_build_algo::IVF_PQ; + } else if (conf.at("graph_build_algo") == "NN_DESCENT") { + param.cagra_params.build_algo = raft::neighbors::cagra::graph_build_algo::NN_DESCENT; + } + } + nlohmann::json ivf_pq_build_conf = collect_conf_with_prefix(conf, "ivf_pq_build_"); + if (!ivf_pq_build_conf.empty()) { + raft::neighbors::ivf_pq::index_params bparam; + parse_build_param(ivf_pq_build_conf, bparam); + param.ivf_pq_build_params = bparam; + } + nlohmann::json ivf_pq_search_conf = collect_conf_with_prefix(conf, "ivf_pq_search_"); + if (!ivf_pq_search_conf.empty()) { + typename raft::bench::ann::RaftIvfPQ::SearchParam sparam; + parse_search_param(ivf_pq_search_conf, sparam); + param.ivf_pq_search_params = sparam.pq_param; + param.ivf_pq_refine_rate = sparam.refine_ratio; + } + nlohmann::json nn_descent_conf = collect_conf_with_prefix(conf, "nn_descent_"); + if (!nn_descent_conf.empty()) { + raft::neighbors::experimental::nn_descent::index_params nn_param; + nn_param.intermediate_graph_degree = 1.5 * param.cagra_params.intermediate_graph_degree; + parse_build_param(nn_descent_conf, nn_param); + if (nn_param.graph_degree != param.cagra_params.intermediate_graph_degree) { + nn_param.graph_degree = param.cagra_params.intermediate_graph_degree; + } + param.nn_descent_params = nn_param; + 
} +} + +raft::bench::ann::AllocatorType parse_allocator(std::string mem_type) +{ + if (mem_type == "device") { + return raft::bench::ann::AllocatorType::Device; + } else if (mem_type == "host_pinned") { + return raft::bench::ann::AllocatorType::HostPinned; + } else if (mem_type == "host_huge_page") { + return raft::bench::ann::AllocatorType::HostHugePage; + } + THROW( + "Invalid value for memory type %s, must be one of [\"device\", \"host_pinned\", " + "\"host_huge_page\"", + mem_type.c_str()); +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftCagra::SearchParam& param) +{ + if (conf.contains("itopk")) { param.p.itopk_size = conf.at("itopk"); } + if (conf.contains("search_width")) { param.p.search_width = conf.at("search_width"); } + if (conf.contains("max_iterations")) { param.p.max_iterations = conf.at("max_iterations"); } + if (conf.contains("algo")) { + if (conf.at("algo") == "single_cta") { + param.p.algo = raft::neighbors::experimental::cagra::search_algo::SINGLE_CTA; + } else if (conf.at("algo") == "multi_cta") { + param.p.algo = raft::neighbors::experimental::cagra::search_algo::MULTI_CTA; + } else if (conf.at("algo") == "multi_kernel") { + param.p.algo = raft::neighbors::experimental::cagra::search_algo::MULTI_KERNEL; + } else if (conf.at("algo") == "auto") { + param.p.algo = raft::neighbors::experimental::cagra::search_algo::AUTO; + } else { + std::string tmp = conf.at("algo"); + THROW("Invalid value for algo: %s", tmp.c_str()); + } + } + if (conf.contains("graph_memory_type")) { + param.graph_mem = parse_allocator(conf.at("graph_memory_type")); + } + if (conf.contains("internal_dataset_memory_type")) { + param.dataset_mem = parse_allocator(conf.at("internal_dataset_memory_type")); + } +} +#endif diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h new file mode 100644 index 000000000..cb30c2693 --- /dev/null +++ 
b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::ann { + +inline raft::distance::DistanceType parse_metric_type(raft::bench::ann::Metric metric) +{ + if (metric == raft::bench::ann::Metric::kInnerProduct) { + return raft::distance::DistanceType::InnerProduct; + } else if (metric == raft::bench::ann::Metric::kEuclidean) { + // Even for L2 expanded RAFT IVF Flat uses unexpanded formula + return raft::distance::DistanceType::L2Expanded; + } else { + throw std::runtime_error("raft supports only metric type of inner product and L2"); + } +} +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu new file mode 100644 index 000000000..f8c65a2d6 --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../common/ann_types.hpp" + +#include "raft_ann_bench_param_parser.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN + if (algo == "raft_bfknn") { ann = std::make_unique>(metric, dim); } +#endif + } + + if constexpr (std::is_same_v) {} + +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT + if (algo == "raft_ivf_flat") { + typename raft::bench::ann::RaftIvfFlatGpu::BuildParam param; + parse_build_param(conf, param); + ann = std::make_unique>(metric, dim, param); + } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ + if (algo == "raft_ivf_pq") { + typename raft::bench::ann::RaftIvfPQ::BuildParam param; + parse_build_param(conf, param); + ann = std::make_unique>(metric, dim, param); + } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_CAGRA + if (algo == "raft_cagra") { + typename raft::bench::ann::RaftCagra::BuildParam param; + parse_build_param(conf, param); + ann = std::make_unique>(metric, dim, param); + } +#endif + + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + return ann; +} 
+ +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN + if (algo == "raft_brute_force") { + auto param = std::make_unique::AnnSearchParam>(); + return param; + } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT + if (algo == "raft_ivf_flat") { + auto param = + std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ + if (algo == "raft_ivf_pq") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_CAGRA + if (algo == "raft_cagra") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } +#endif + + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +}; // namespace raft::bench::ann + +REGISTER_ALGO_INSTANCE(float); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) +{ + rmm::mr::cuda_memory_resource cuda_mr; + // Construct a resource that uses a coalescing best-fit pool allocator + rmm::mr::pool_memory_resource pool_mr{&cuda_mr}; + rmm::mr::set_current_device_resource( + &pool_mr); // Updates the current device resource pointer to `pool_mr` + rmm::mr::device_memory_resource* mr = + rmm::mr::get_current_device_resource(); // Points to `pool_mr` + return raft::bench::ann::run_main(argc, argv); +} +#endif diff --git a/cpp/bench/ann/src/raft/raft_cagra.cu b/cpp/bench/ann/src/raft/raft_cagra.cu new file mode 100644 index 000000000..be18af7f2 --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_cagra.cu @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "raft_cagra_wrapper.h" + +namespace raft::bench::ann { +template class RaftCagra; +template class RaftCagra; +template class RaftCagra; +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu b/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu new file mode 100644 index 000000000..ce6fa255b --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../common/ann_types.hpp" +#include "raft_ann_bench_param_parser.h" +#include "raft_cagra_hnswlib_wrapper.h" + +#include + +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftCagraHnswlib::SearchParam& param) +{ + param.ef = conf.at("ef"); + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v or std::is_same_v) { + if (algo == "raft_cagra_hnswlib") { + typename raft::bench::ann::RaftCagraHnswlib::BuildParam param; + parse_build_param(conf, param); + ann = std::make_unique>(metric, dim, param); + } + } + + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "raft_cagra_hnswlib") { + auto param = + std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } + + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace raft::bench::ann + +REGISTER_ALGO_INSTANCE(float); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) +{ + rmm::mr::cuda_memory_resource cuda_mr; + // Construct a resource that uses a coalescing best-fit pool allocator + rmm::mr::pool_memory_resource pool_mr{&cuda_mr}; + rmm::mr::set_current_device_resource( + &pool_mr); // Updates the current device 
resource pointer to `pool_mr` + rmm::mr::device_memory_resource* mr = + rmm::mr::get_current_device_resource(); // Points to `pool_mr` + return raft::bench::ann::run_main(argc, argv); +} +#endif diff --git a/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h new file mode 100644 index 000000000..432caecfc --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "../hnswlib/hnswlib_wrapper.h" +#include "raft_cagra_wrapper.h" +#include + +namespace raft::bench::ann { + +template +class RaftCagraHnswlib : public ANN { + public: + using typename ANN::AnnSearchParam; + using BuildParam = typename RaftCagra::BuildParam; + using SearchParam = typename HnswLib::SearchParam; + + RaftCagraHnswlib(Metric metric, int dim, const BuildParam& param, int concurrent_searches = 1) + : ANN(metric, dim), + metric_(metric), + index_params_(param), + dimension_(dim), + handle_(cudaStreamPerThread) + { + } + + ~RaftCagraHnswlib() noexcept {} + + void build(const T* dataset, size_t nrow, cudaStream_t stream) final; + + void set_search_param(const AnnSearchParam& param) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + // to enable dataset access from GPU memory + AlgoProperty get_preference() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::HostMmap; + property.query_memory_type = MemoryType::Host; + return property; + } + void save(const std::string& file) const override; + void load(const std::string&) override; + + private: + raft::device_resources handle_; + Metric metric_; + BuildParam index_params_; + int dimension_; + + std::unique_ptr> cagra_build_; + std::unique_ptr> hnswlib_search_; + + Objective metric_objective_; +}; + +template +void RaftCagraHnswlib::build(const T* dataset, size_t nrow, cudaStream_t stream) +{ + if (not cagra_build_) { + cagra_build_ = std::make_unique>(metric_, dimension_, index_params_); + } + cagra_build_->build(dataset, nrow, stream); +} + +template +void RaftCagraHnswlib::set_search_param(const AnnSearchParam& param_) +{ + hnswlib_search_->set_search_param(param_); +} + +template +void 
RaftCagraHnswlib::save(const std::string& file) const +{ + cagra_build_->save_to_hnswlib(file); +} + +template +void RaftCagraHnswlib::load(const std::string& file) +{ + typename HnswLib::BuildParam param; + // these values don't matter since we don't build with HnswLib + param.M = 50; + param.ef_construction = 100; + if (not hnswlib_search_) { + hnswlib_search_ = std::make_unique>(metric_, dimension_, param); + } + hnswlib_search_->load(file); + hnswlib_search_->set_base_layer_only(); +} + +template +void RaftCagraHnswlib::search( + const T* queries, int batch_size, int k, size_t* neighbors, float* distances, cudaStream_t) const +{ + hnswlib_search_->search(queries, batch_size, k, neighbors, distances); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h new file mode 100644 index 000000000..a3e481ec5 --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include "raft_ann_bench_utils.h" +#include + +#include "../common/cuda_huge_page_resource.hpp" +#include "../common/cuda_pinned_resource.hpp" + +#include +#include + +namespace raft::bench::ann { + +enum class AllocatorType { HostPinned, HostHugePage, Device }; +template +class RaftCagra : public ANN { + public: + using typename ANN::AnnSearchParam; + + struct SearchParam : public AnnSearchParam { + raft::neighbors::experimental::cagra::search_params p; + AllocatorType graph_mem = AllocatorType::Device; + AllocatorType dataset_mem = AllocatorType::Device; + auto needs_dataset() const -> bool override { return true; } + }; + + struct BuildParam { + raft::neighbors::cagra::index_params cagra_params; + std::optional nn_descent_params = + std::nullopt; + std::optional ivf_pq_refine_rate = std::nullopt; + std::optional ivf_pq_build_params = std::nullopt; + std::optional ivf_pq_search_params = std::nullopt; + }; + + RaftCagra(Metric metric, int dim, const BuildParam& param, int concurrent_searches = 1) + : ANN(metric, dim), + index_params_(param), + dimension_(dim), + handle_(cudaStreamPerThread), + need_dataset_update_(true), + dataset_(make_device_matrix(handle_, 0, 0)), + graph_(make_device_matrix(handle_, 0, 0)), + input_dataset_v_(nullptr, 0, 0), + graph_mem_(AllocatorType::Device), + dataset_mem_(AllocatorType::Device) + { + index_params_.cagra_params.metric = parse_metric_type(metric); + index_params_.ivf_pq_build_params->metric = parse_metric_type(metric); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); + } + + ~RaftCagra() noexcept {} + + void build(const T* dataset, size_t nrow, cudaStream_t stream) final; + + void set_search_param(const AnnSearchParam& param) override; + + void 
set_search_dataset(const T* dataset, size_t nrow) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + // to enable dataset access from GPU memory + AlgoProperty get_preference() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::HostMmap; + property.query_memory_type = MemoryType::Device; + return property; + } + void save(const std::string& file) const override; + void load(const std::string&) override; + void save_to_hnswlib(const std::string& file) const; + + private: + inline rmm::mr::device_memory_resource* get_mr(AllocatorType mem_type) + { + switch (mem_type) { + case (AllocatorType::HostPinned): return &mr_pinned_; + case (AllocatorType::HostHugePage): return &mr_huge_page_; + default: return rmm::mr::get_current_device_resource(); + } + } + raft ::mr::cuda_pinned_resource mr_pinned_; + raft ::mr::cuda_huge_page_resource mr_huge_page_; + raft::device_resources handle_; + AllocatorType graph_mem_; + AllocatorType dataset_mem_; + BuildParam index_params_; + bool need_dataset_update_; + raft::neighbors::cagra::search_params search_params_; + std::optional> index_; + int device_; + int dimension_; + raft::device_matrix graph_; + raft::device_matrix dataset_; + raft::device_matrix_view input_dataset_v_; +}; + +template +void RaftCagra::build(const T* dataset, size_t nrow, cudaStream_t) +{ + auto dataset_view = + raft::make_host_matrix_view(dataset, IdxT(nrow), dimension_); + + auto& params = index_params_.cagra_params; + + index_.emplace(raft::neighbors::cagra::detail::build(handle_, + params, + dataset_view, + index_params_.nn_descent_params, + index_params_.ivf_pq_refine_rate, + index_params_.ivf_pq_build_params, + index_params_.ivf_pq_search_params)); + return; +} + +inline std::string 
allocator_to_string(AllocatorType mem_type) +{ + if (mem_type == AllocatorType::Device) { + return "device"; + } else if (mem_type == AllocatorType::HostPinned) { + return "host_pinned"; + } else if (mem_type == AllocatorType::HostHugePage) { + return "host_huge_page"; + } + return ""; +} + +template +void RaftCagra::set_search_param(const AnnSearchParam& param) +{ + auto search_param = dynamic_cast(param); + search_params_ = search_param.p; + if (search_param.graph_mem != graph_mem_) { + // Move graph to correct memory space + graph_mem_ = search_param.graph_mem; + RAFT_LOG_INFO("moving graph to new memory space: %s", allocator_to_string(graph_mem_).c_str()); + // We create a new graph and copy to it from existing graph + auto mr = get_mr(graph_mem_); + auto new_graph = make_device_mdarray( + handle_, mr, make_extents(index_->graph().extent(0), index_->graph_degree())); + + raft::copy(new_graph.data_handle(), + index_->graph().data_handle(), + index_->graph().size(), + resource::get_cuda_stream(handle_)); + + index_->update_graph(handle_, make_const_mdspan(new_graph.view())); + // update_graph() only stores a view in the index. We need to keep the graph object alive. + graph_ = std::move(new_graph); + } + + if (search_param.dataset_mem != dataset_mem_ || need_dataset_update_) { + dataset_mem_ = search_param.dataset_mem; + + // First free up existing memory + dataset_ = make_device_matrix(handle_, 0, 0); + index_->update_dataset(handle_, make_const_mdspan(dataset_.view())); + + // Allocate space using the correct memory resource. + RAFT_LOG_INFO("moving dataset to new memory space: %s", + allocator_to_string(dataset_mem_).c_str()); + + auto mr = get_mr(dataset_mem_); + raft::neighbors::cagra::detail::copy_with_padding(handle_, dataset_, input_dataset_v_, mr); + + index_->update_dataset(handle_, make_const_mdspan(dataset_.view())); + + // Ideally, instead of dataset_.view(), we should pass a strided matrix view to update. 
+ // See Issue https://github.com/rapidsai/raft/issues/1972 for details. + // auto dataset_view = make_device_strided_matrix_view( + // dataset_.data_handle(), dataset_.extent(0), this->dim_, dataset_.extent(1)); + // index_->update_dataset(handle_, dataset_view); + need_dataset_update_ = false; + } +} + +template +void RaftCagra::set_search_dataset(const T* dataset, size_t nrow) +{ + // It can happen that we are re-using a previous algo object which already has + // the dataset set. Check if we need update. + if (static_cast(input_dataset_v_.extent(0)) != nrow || + input_dataset_v_.data_handle() != dataset) { + input_dataset_v_ = make_device_matrix_view(dataset, nrow, this->dim_); + need_dataset_update_ = true; + } +} + +template +void RaftCagra::save(const std::string& file) const +{ + raft::neighbors::cagra::serialize(handle_, file, *index_); +} + +template +void RaftCagra::save_to_hnswlib(const std::string& file) const +{ + raft::neighbors::cagra::serialize_to_hnswlib(handle_, file, *index_); +} + +template +void RaftCagra::load(const std::string& file) +{ + index_ = raft::neighbors::cagra::deserialize(handle_, file); +} + +template +void RaftCagra::search( + const T* queries, int batch_size, int k, size_t* neighbors, float* distances, cudaStream_t) const +{ + IdxT* neighbors_IdxT; + rmm::device_uvector neighbors_storage(0, resource::get_cuda_stream(handle_)); + if constexpr (std::is_same::value) { + neighbors_IdxT = neighbors; + } else { + neighbors_storage.resize(batch_size * k, resource::get_cuda_stream(handle_)); + neighbors_IdxT = neighbors_storage.data(); + } + + auto queries_view = + raft::make_device_matrix_view(queries, batch_size, dimension_); + auto neighbors_view = raft::make_device_matrix_view(neighbors_IdxT, batch_size, k); + auto distances_view = raft::make_device_matrix_view(distances, batch_size, k); + + raft::neighbors::cagra::search( + handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + + if 
(!std::is_same::value) { + raft::linalg::unaryOp(neighbors, + neighbors_IdxT, + batch_size * k, + raft::cast_op(), + raft::resource::get_cuda_stream(handle_)); + } + + handle_.sync_stream(); +} +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat.cu b/cpp/bench/ann/src/raft/raft_ivf_flat.cu new file mode 100644 index 000000000..bcd23723a --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_ivf_flat.cu @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "raft_ivf_flat_wrapper.h" + +namespace raft::bench::ann { +template class RaftIvfFlatGpu; +template class RaftIvfFlatGpu; +template class RaftIvfFlatGpu; +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h new file mode 100644 index 000000000..24b3c69bb --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include "raft_ann_bench_utils.h" +#include + +namespace raft::bench::ann { + +template +class RaftIvfFlatGpu : public ANN { + public: + using typename ANN::AnnSearchParam; + + struct SearchParam : public AnnSearchParam { + raft::neighbors::ivf_flat::search_params ivf_flat_params; + }; + + using BuildParam = raft::neighbors::ivf_flat::index_params; + + RaftIvfFlatGpu(Metric metric, int dim, const BuildParam& param) + : ANN(metric, dim), index_params_(param), dimension_(dim) + { + index_params_.metric = parse_metric_type(metric); + index_params_.conservative_memory_allocation = true; + RAFT_CUDA_TRY(cudaGetDevice(&device_)); + } + + ~RaftIvfFlatGpu() noexcept {} + + void build(const T* dataset, size_t nrow, cudaStream_t stream) final; + + void set_search_param(const AnnSearchParam& param) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + // to enable dataset access from GPU memory + AlgoProperty get_preference() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Device; + property.query_memory_type = MemoryType::Device; + return property; + } + 
void save(const std::string& file) const override; + void load(const std::string&) override; + + private: + raft::device_resources handle_; + BuildParam index_params_; + raft::neighbors::ivf_flat::search_params search_params_; + std::optional> index_; + int device_; + int dimension_; +}; + +template +void RaftIvfFlatGpu::build(const T* dataset, size_t nrow, cudaStream_t) +{ + index_.emplace( + raft::neighbors::ivf_flat::build(handle_, index_params_, dataset, IdxT(nrow), dimension_)); + return; +} + +template +void RaftIvfFlatGpu::set_search_param(const AnnSearchParam& param) +{ + auto search_param = dynamic_cast(param); + search_params_ = search_param.ivf_flat_params; + assert(search_params_.n_probes <= index_params_.n_lists); +} + +template +void RaftIvfFlatGpu::save(const std::string& file) const +{ + raft::neighbors::ivf_flat::serialize(handle_, file, *index_); + return; +} + +template +void RaftIvfFlatGpu::load(const std::string& file) +{ + index_ = raft::neighbors::ivf_flat::deserialize(handle_, file); + return; +} + +template +void RaftIvfFlatGpu::search( + const T* queries, int batch_size, int k, size_t* neighbors, float* distances, cudaStream_t) const +{ + static_assert(sizeof(size_t) == sizeof(IdxT), "IdxT is incompatible with size_t"); + raft::neighbors::ivf_flat::search( + handle_, search_params_, *index_, queries, batch_size, k, (IdxT*)neighbors, distances); + resource::sync_stream(handle_); + return; +} +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq.cu b/cpp/bench/ann/src/raft/raft_ivf_pq.cu new file mode 100644 index 000000000..2efe14631 --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_ivf_pq.cu @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "raft_ivf_pq_wrapper.h" + +namespace raft::bench::ann { +template class RaftIvfPQ; +template class RaftIvfPQ; +template class RaftIvfPQ; +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h new file mode 100644 index 000000000..e4004b000 --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include "raft_ann_bench_utils.h" +#include + +namespace raft::bench::ann { + +template +class RaftIvfPQ : public ANN { + public: + using typename ANN::AnnSearchParam; + using ANN::dim_; + + struct SearchParam : public AnnSearchParam { + raft::neighbors::ivf_pq::search_params pq_param; + float refine_ratio = 1.0f; + auto needs_dataset() const -> bool override { return refine_ratio > 1.0f; } + }; + + using BuildParam = raft::neighbors::ivf_pq::index_params; + + RaftIvfPQ(Metric metric, int dim, const BuildParam& param) + : ANN(metric, dim), index_params_(param), dimension_(dim) + { + index_params_.metric = parse_metric_type(metric); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); + RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming)); + } + + ~RaftIvfPQ() noexcept { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_)); } + + void build(const T* dataset, size_t nrow, cudaStream_t stream) final; + + void set_search_param(const AnnSearchParam& param) override; + void set_search_dataset(const T* dataset, size_t nrow) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + // to enable dataset access from GPU memory + AlgoProperty get_preference() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Host; + property.query_memory_type = MemoryType::Device; + return property; + } + void save(const std::string& file) const override; + void load(const std::string&) override; + + private: + raft::device_resources handle_; + cudaEvent_t sync_{nullptr}; + BuildParam index_params_; + 
raft::neighbors::ivf_pq::search_params search_params_; + std::optional> index_; + int device_; + int dimension_; + float refine_ratio_ = 1.0; + raft::device_matrix_view dataset_; + + void stream_wait(cudaStream_t stream) const + { + RAFT_CUDA_TRY(cudaEventRecord(sync_, resource::get_cuda_stream(handle_))); + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_)); + } +}; + +template +void RaftIvfPQ::save(const std::string& file) const +{ + raft::runtime::neighbors::ivf_pq::serialize(handle_, file, *index_); +} + +template +void RaftIvfPQ::load(const std::string& file) +{ + auto index_tmp = raft::neighbors::ivf_pq::index(handle_, index_params_, dimension_); + raft::runtime::neighbors::ivf_pq::deserialize(handle_, file, &index_tmp); + index_.emplace(std::move(index_tmp)); + return; +} + +template +void RaftIvfPQ::build(const T* dataset, size_t nrow, cudaStream_t stream) +{ + auto dataset_v = raft::make_device_matrix_view(dataset, IdxT(nrow), dim_); + + index_.emplace(raft::runtime::neighbors::ivf_pq::build(handle_, index_params_, dataset_v)); + stream_wait(stream); +} + +template +void RaftIvfPQ::set_search_param(const AnnSearchParam& param) +{ + auto search_param = dynamic_cast(param); + search_params_ = search_param.pq_param; + refine_ratio_ = search_param.refine_ratio; + assert(search_params_.n_probes <= index_params_.n_lists); +} + +template +void RaftIvfPQ::set_search_dataset(const T* dataset, size_t nrow) +{ + dataset_ = raft::make_device_matrix_view(dataset, nrow, index_->dim()); +} + +template +void RaftIvfPQ::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + if (refine_ratio_ > 1.0f) { + uint32_t k0 = static_cast(refine_ratio_ * k); + auto queries_v = + raft::make_device_matrix_view(queries, batch_size, index_->dim()); + auto distances_tmp = raft::make_device_matrix(handle_, batch_size, k0); + auto candidates = raft::make_device_matrix(handle_, batch_size, k0); + + 
raft::runtime::neighbors::ivf_pq::search( + handle_, search_params_, *index_, queries_v, candidates.view(), distances_tmp.view()); + + if (raft::get_device_for_address(dataset_.data_handle()) >= 0) { + auto queries_v = + raft::make_device_matrix_view(queries, batch_size, index_->dim()); + auto neighbors_v = raft::make_device_matrix_view((IdxT*)neighbors, batch_size, k); + auto distances_v = raft::make_device_matrix_view(distances, batch_size, k); + + raft::runtime::neighbors::refine(handle_, + dataset_, + queries_v, + candidates.view(), + neighbors_v, + distances_v, + index_->metric()); + stream_wait(stream); // RAFT stream -> bench stream + } else { + auto queries_host = raft::make_host_matrix(batch_size, index_->dim()); + auto candidates_host = raft::make_host_matrix(batch_size, k0); + auto neighbors_host = raft::make_host_matrix(batch_size, k); + auto distances_host = raft::make_host_matrix(batch_size, k); + + raft::copy(queries_host.data_handle(), queries, queries_host.size(), stream); + raft::copy(candidates_host.data_handle(), + candidates.data_handle(), + candidates_host.size(), + resource::get_cuda_stream(handle_)); + + auto dataset_v = raft::make_host_matrix_view( + dataset_.data_handle(), dataset_.extent(0), dataset_.extent(1)); + + // wait for the queries to copy to host in 'stream` and for IVF-PQ::search to finish + RAFT_CUDA_TRY(cudaEventRecord(sync_, resource::get_cuda_stream(handle_))); + RAFT_CUDA_TRY(cudaEventRecord(sync_, stream)); + RAFT_CUDA_TRY(cudaEventSynchronize(sync_)); + raft::runtime::neighbors::refine(handle_, + dataset_v, + queries_host.view(), + candidates_host.view(), + neighbors_host.view(), + distances_host.view(), + index_->metric()); + + raft::copy(neighbors, (size_t*)neighbors_host.data_handle(), neighbors_host.size(), stream); + raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream); + } + } else { + auto queries_v = + raft::make_device_matrix_view(queries, batch_size, index_->dim()); + auto 
neighbors_v = raft::make_device_matrix_view((IdxT*)neighbors, batch_size, k); + auto distances_v = raft::make_device_matrix_view(distances, batch_size, k); + + raft::runtime::neighbors::ivf_pq::search( + handle_, search_params_, *index_, queries_v, neighbors_v, distances_v); + stream_wait(stream); // RAFT stream -> bench stream + } +} +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_wrapper.h b/cpp/bench/ann/src/raft/raft_wrapper.h new file mode 100644 index 000000000..499bdf29a --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_wrapper.h @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" + +namespace raft_temp { + +inline raft::distance::DistanceType parse_metric_type(raft::bench::ann::Metric metric) +{ + if (metric == raft::bench::ann::Metric::kInnerProduct) { + return raft::distance::DistanceType::InnerProduct; + } else if (metric == raft::bench::ann::Metric::kEuclidean) { + return raft::distance::DistanceType::L2Expanded; + } else { + throw std::runtime_error("raft supports only metric type of inner product and L2"); + } +} + +} // namespace raft_temp + +namespace raft::bench::ann { + +// brute force fused L2 KNN - RAFT +template +class RaftGpu : public ANN { + public: + using typename ANN::AnnSearchParam; + + RaftGpu(Metric metric, int dim); + + void build(const T*, size_t, cudaStream_t) final; + + void set_search_param(const AnnSearchParam& param) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const final; + + // to enable dataset access from GPU memory + AlgoProperty get_preference() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Device; + property.query_memory_type = MemoryType::Device; + return property; + } + void set_search_dataset(const T* dataset, size_t nrow) override; + void save(const std::string& file) const override; + void load(const std::string&) override { return; }; + + protected: + raft::distance::DistanceType metric_type_; + int device_; + const T* dataset_; + size_t nrow_; +}; + +template +RaftGpu::RaftGpu(Metric metric, int dim) + : ANN(metric, dim), metric_type_(raft_temp::parse_metric_type(metric)) +{ + static_assert(std::is_same_v, "raft support only float type"); + assert(metric_type_ == 
raft::distance::DistanceType::L2Expanded); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); +} + +template +void RaftGpu::build(const T*, size_t, cudaStream_t) +{ + // as this is brute force algo so no index building required + return; +} + +template +void RaftGpu::set_search_param(const AnnSearchParam&) +{ + // Nothing to set here as it is brute force implementation +} + +template +void RaftGpu::set_search_dataset(const T* dataset, size_t nrow) +{ + dataset_ = dataset; + nrow_ = nrow; +} + +template +void RaftGpu::save(const std::string& file) const +{ + // create a empty index file as no index to store. + std::fstream fp; + fp.open(file.c_str(), std::ios::out); + if (!fp) { + printf("Error in creating file!!!\n"); + ; + return; + } + fp.close(); +} + +template +void RaftGpu::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + // TODO: Integrate new `raft::brute_force::index` (from + // https://github.com/rapidsai/raft/pull/1817) + raft::spatial::knn::detail::fusedL2Knn(this->dim_, + reinterpret_cast(neighbors), + distances, + dataset_, + queries, + nrow_, + static_cast(batch_size), + k, + true, + true, + stream, + metric_type_); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt new file mode 100644 index 000000000..fe58453d0 --- /dev/null +++ b/cpp/bench/prims/CMakeLists.txt @@ -0,0 +1,166 @@ +# ============================================================================= +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# ################################################################################################## +# * compiler function ----------------------------------------------------------------------------- + +function(ConfigureBench) + + set(options OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY) + set(oneValueArgs NAME) + set(multiValueArgs PATH TARGETS CONFIGURATIONS) + + cmake_parse_arguments(ConfigureBench "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(BENCH_NAME ${ConfigureBench_NAME}) + + add_executable(${BENCH_NAME} ${ConfigureBench_PATH}) + + target_link_libraries( + ${BENCH_NAME} + PRIVATE raft::raft + raft_internal + $<$:raft::compiled> + ${RAFT_CTK_MATH_DEPENDENCIES} + benchmark::benchmark + Threads::Threads + $ + $ + ) + + set_target_properties( + ${BENCH_NAME} + PROPERTIES # set target compile options + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) + + target_compile_options( + ${BENCH_NAME} PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) + + if(ConfigureTest_EXPLICIT_INSTANTIATE_ONLY) + target_compile_definitions(${BENCH_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY") + endif() + + target_include_directories( + ${BENCH_NAME} PUBLIC "$" + ) + + install( + TARGETS ${BENCH_NAME} + COMPONENT testing + DESTINATION bin/gbench/prims/libraft + EXCLUDE_FROM_ALL + ) + +endfunction() + 
+if(BUILD_PRIMS_BENCH) + ConfigureBench( + NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/core/copy.cu bench/prims/main.cpp + ) + + ConfigureBench( + NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu + bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureBench( + NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu + bench/prims/distance/tune_pairwise/bench.cu bench/prims/main.cpp + ) + + ConfigureBench( + NAME + DISTANCE_BENCH + PATH + bench/prims/distance/distance_cosine.cu + bench/prims/distance/distance_exp_l2.cu + bench/prims/distance/distance_l1.cu + bench/prims/distance/distance_unexp_l2.cu + bench/prims/distance/fused_l2_nn.cu + bench/prims/distance/masked_nn.cu + bench/prims/distance/kernels.cu + bench/prims/main.cpp + OPTIONAL + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureBench( + NAME + LINALG_BENCH + PATH + bench/prims/linalg/add.cu + bench/prims/linalg/map_then_reduce.cu + bench/prims/linalg/matrix_vector_op.cu + bench/prims/linalg/norm.cu + bench/prims/linalg/normalize.cu + bench/prims/linalg/reduce_cols_by_key.cu + bench/prims/linalg/reduce_rows_by_key.cu + bench/prims/linalg/reduce.cu + bench/prims/main.cpp + ) + + ConfigureBench( + NAME + MATRIX_BENCH + PATH + bench/prims/matrix/argmin.cu + bench/prims/matrix/gather.cu + bench/prims/matrix/select_k.cu + bench/prims/matrix/main.cpp + OPTIONAL + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureBench( + NAME RANDOM_BENCH PATH bench/prims/random/make_blobs.cu bench/prims/random/permute.cu + bench/prims/random/rng.cu bench/prims/main.cpp + ) + + ConfigureBench(NAME SPARSE_BENCH PATH bench/prims/sparse/convert_csr.cu bench/prims/main.cpp) + + ConfigureBench( + NAME + NEIGHBORS_BENCH + PATH + bench/prims/neighbors/knn/brute_force_float_int64_t.cu + bench/prims/neighbors/knn/brute_force_float_uint32_t.cu + bench/prims/neighbors/knn/cagra_float_uint32_t.cu + 
bench/prims/neighbors/knn/ivf_flat_filter_float_int64_t.cu + bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu + bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu + bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu + bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu + bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu + bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu + bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu + bench/prims/neighbors/refine_float_int64_t.cu + bench/prims/neighbors/refine_uint8_t_int64_t.cu + bench/prims/main.cpp + OPTIONAL + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + +endif() diff --git a/cpp/bench/prims/cluster/kmeans.cu b/cpp/bench/prims/cluster/kmeans.cu new file mode 100644 index 000000000..3147960f7 --- /dev/null +++ b/cpp/bench/prims/cluster/kmeans.cu @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace raft::bench::cluster { + +struct KMeansBenchParams { + DatasetParams data; + BlobsParams blobs; + raft::cluster::KMeansParams kmeans; +}; + +inline auto operator<<(std::ostream& os, const KMeansBenchParams& p) -> std::ostream& +{ + os << p.data.rows << "#" << p.data.cols << "#" << p.kmeans.n_clusters; + return os; +} + +template +struct KMeans : public BlobsFixture { + KMeans(const KMeansBenchParams& p) + : BlobsFixture(p.data, p.blobs), + params(p), + centroids(this->handle), + labels(this->handle) + { + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + raft::device_matrix_view X_view = this->X.view(); + std::optional> opt_weights_view = std::nullopt; + std::optional> centroids_view = + std::make_optional>(centroids.view()); + raft::device_vector_view labels_view = labels.view(); + raft::host_scalar_view inertia_view = raft::make_host_scalar_view(&inertia); + raft::host_scalar_view n_iter_view = raft::make_host_scalar_view(&n_iter); + + this->loop_on_state(state, [&]() { + raft::cluster::kmeans_fit_predict(this->handle, + params.kmeans, + X_view, + opt_weights_view, + centroids_view, + labels_view, + inertia_view, + n_iter_view); + }); + } + + void allocate_temp_buffers(const ::benchmark::State& state) override + { + centroids = + raft::make_device_matrix(this->handle, params.kmeans.n_clusters, params.data.cols); + labels = raft::make_device_vector(this->handle, params.data.rows); + } + + private: + KMeansBenchParams params; + raft::device_matrix centroids; + raft::device_vector labels; + T inertia; + IndexT n_iter; +}; // struct KMeans + +std::vector getKMeansInputs() +{ + std::vector out; + KMeansBenchParams p; + p.data.row_major = true; + p.blobs.cluster_std = 1.0; + p.blobs.shuffle = false; + p.blobs.center_box_min = -10.0; + p.blobs.center_box_max = 10.0; + p.blobs.seed = 12345ULL; + 
p.kmeans.init = raft::cluster::KMeansParams::KMeansPlusPlus; + p.kmeans.max_iter = 300; + p.kmeans.tol = 1e-4; + p.kmeans.verbosity = RAFT_LEVEL_INFO; + p.kmeans.metric = raft::distance::DistanceType::L2Expanded; + p.kmeans.inertia_check = true; + std::vector> row_cols_k = { + {1000000, 20, 1000}, + {3000000, 50, 20}, + {10000000, 50, 5}, + }; + for (auto& rck : row_cols_k) { + p.data.rows = std::get<0>(rck); + p.data.cols = std::get<1>(rck); + p.blobs.n_clusters = std::get<2>(rck); + p.kmeans.n_clusters = std::get<2>(rck); + out.push_back(p); + } + return out; +} + +// note(lsugy): commenting out int64_t because the templates are not compiled in the distance +// library, resulting in long compilation times. +RAFT_BENCH_REGISTER((KMeans), "", getKMeansInputs()); +RAFT_BENCH_REGISTER((KMeans), "", getKMeansInputs()); +// RAFT_BENCH_REGISTER((KMeans), "", getKMeansInputs()); +// RAFT_BENCH_REGISTER((KMeans), "", getKMeansInputs()); + +} // namespace raft::bench::cluster diff --git a/cpp/bench/prims/cluster/kmeans_balanced.cu b/cpp/bench/prims/cluster/kmeans_balanced.cu new file mode 100644 index 000000000..129578c30 --- /dev/null +++ b/cpp/bench/prims/cluster/kmeans_balanced.cu @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +namespace raft::bench::cluster { + +struct KMeansBalancedBenchParams { + DatasetParams data; + uint32_t n_lists; + raft::cluster::kmeans_balanced_params kb_params; +}; + +template +struct KMeansBalanced : public fixture { + KMeansBalanced(const KMeansBalancedBenchParams& p) : params(p), X(handle), centroids(handle) {} + + void run_benchmark(::benchmark::State& state) override + { + this->loop_on_state(state, [this]() { + raft::device_matrix_view X_view = this->X.view(); + raft::device_matrix_view centroids_view = this->centroids.view(); + raft::cluster::kmeans_balanced::fit( + this->handle, this->params.kb_params, X_view, centroids_view); + }); + } + + void allocate_data(const ::benchmark::State& state) override + { + X = raft::make_device_matrix(handle, params.data.rows, params.data.cols); + + raft::random::RngState rng{1234}; + constexpr T kRangeMax = std::is_integral_v ? std::numeric_limits::max() : T(1); + constexpr T kRangeMin = std::is_integral_v ? 
std::numeric_limits::min() : T(-1); + if constexpr (std::is_integral_v) { + raft::random::uniformInt( + handle, rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax); + } else { + raft::random::uniform( + handle, rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax); + } + resource::sync_stream(handle, stream); + } + + void allocate_temp_buffers(const ::benchmark::State& state) override + { + centroids = + raft::make_device_matrix(this->handle, params.n_lists, params.data.cols); + } + + private: + KMeansBalancedBenchParams params; + raft::device_matrix X; + raft::device_matrix centroids; +}; // struct KMeansBalanced + +std::vector getKMeansBalancedInputs() +{ + std::vector out; + KMeansBalancedBenchParams p; + p.data.row_major = true; + p.kb_params.n_iters = 20; + p.kb_params.metric = raft::distance::DistanceType::L2Expanded; + std::vector> row_cols = { + {100000, 128}, {1000000, 128}, {10000000, 128}, + // The following dataset sizes are too large for most GPUs. + // {100000000, 128}, + }; + for (auto& rc : row_cols) { + p.data.rows = rc.first; + p.data.cols = rc.second; + for (auto n_lists : std::vector({1000, 10000, 100000})) { + p.n_lists = n_lists; + out.push_back(p); + } + } + return out; +} + +// Note: the datasets sizes are too large for 32-bit index types. +RAFT_BENCH_REGISTER((KMeansBalanced), "", getKMeansBalancedInputs()); + +} // namespace raft::bench::cluster diff --git a/cpp/bench/prims/common/benchmark.hpp b/cpp/bench/prims/common/benchmark.hpp new file mode 100644 index 000000000..d3da3bff6 --- /dev/null +++ b/cpp/bench/prims/common/benchmark.hpp @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +namespace raft::bench { + +/** + * RAII way to temporary set the pooling memory allocator in rmm. + * This may be useful for benchmarking functions that do some memory allocations. + */ +struct using_pool_memory_res { + private: + rmm::mr::device_memory_resource* orig_res_; + rmm::mr::cuda_memory_resource cuda_res_; + rmm::mr::pool_memory_resource pool_res_; + + public: + using_pool_memory_res(size_t initial_size, size_t max_size) + : orig_res_(rmm::mr::get_current_device_resource()), + pool_res_(&cuda_res_, initial_size, max_size) + { + rmm::mr::set_current_device_resource(&pool_res_); + } + + using_pool_memory_res() : orig_res_(rmm::mr::get_current_device_resource()), pool_res_(&cuda_res_) + { + rmm::mr::set_current_device_resource(&pool_res_); + } + + ~using_pool_memory_res() { rmm::mr::set_current_device_resource(orig_res_); } +}; + +/** + * RAII way of timing cuda calls. This has been shamelessly copied from the + * cudf codebase via cuml codebase. So, credits for this class goes to cudf developers. + */ +struct cuda_event_timer { + private: + ::benchmark::State* state_; + rmm::cuda_stream_view stream_; + cudaEvent_t start_; + cudaEvent_t stop_; + + public: + /** + * @param state the benchmark::State whose timer we are going to update. + * @param stream CUDA stream we are measuring time on. 
+ */ + cuda_event_timer(::benchmark::State& state, rmm::cuda_stream_view stream) + : state_(&state), stream_(stream) + { + RAFT_CUDA_TRY(cudaEventCreate(&start_)); + RAFT_CUDA_TRY(cudaEventCreate(&stop_)); + raft::interruptible::synchronize(stream_); + RAFT_CUDA_TRY(cudaEventRecord(start_, stream_)); + } + cuda_event_timer() = delete; + + /** + * @brief The dtor stops the timer and performs a synchroniazation. Time of + * the benchmark::State object provided to the ctor will be set to the + * value given by `cudaEventElapsedTime()`. + */ + ~cuda_event_timer() + { + RAFT_CUDA_TRY_NO_THROW(cudaEventRecord(stop_, stream_)); + raft::interruptible::synchronize(stop_); + float milliseconds = 0.0f; + RAFT_CUDA_TRY_NO_THROW(cudaEventElapsedTime(&milliseconds, start_, stop_)); + state_->SetIterationTime(milliseconds / 1000.f); + RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(start_)); + RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(stop_)); + } +}; + +/** Main fixture to be inherited and used by all other c++ benchmarks */ +class fixture { + private: + rmm::device_buffer scratch_buf_; + + public: + raft::device_resources handle; + rmm::cuda_stream_view stream; + + fixture(bool use_pool_memory_resource = false) : stream{resource::get_cuda_stream(handle)} + { + // Cache memory pool between test runs, since it is expensive to create. + // This speeds up the time required to run the select_k bench by over 3x. + // This is part of the fixture class here so that the pool will get cleaned + // up, rather than outliving the benchmarks that require it. 
+ static std::unique_ptr memory_pool; + if (use_pool_memory_resource) { + if (!memory_pool) { memory_pool.reset(new using_pool_memory_res()); } + } else if (memory_pool) { + memory_pool.reset(); + } + + int l2_cache_size = 0; + int device_id = 0; + RAFT_CUDA_TRY(cudaGetDevice(&device_id)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_size, cudaDevAttrL2CacheSize, device_id)); + scratch_buf_ = rmm::device_buffer(l2_cache_size * 3, stream); + } + + // every benchmark should be overriding this + virtual void run_benchmark(::benchmark::State& state) = 0; + virtual void generate_metrics(::benchmark::State& state) {} + virtual void allocate_data(const ::benchmark::State& state) {} + virtual void deallocate_data(const ::benchmark::State& state) {} + virtual void allocate_temp_buffers(const ::benchmark::State& state) {} + virtual void deallocate_temp_buffers(const ::benchmark::State& state) {} + + protected: + /** The helper that writes zeroes to some buffer in GPU memory to flush the L2 cache. */ + void flush_L2_cache() + { + RAFT_CUDA_TRY(cudaMemsetAsync(scratch_buf_.data(), 0, scratch_buf_.size(), stream)); + } + + /** + * The helper to be used inside `run_benchmark`, to loop over the state and record time using the + * cuda_event_timer. + */ + template + void loop_on_state(::benchmark::State& state, Lambda benchmark_func, bool flush_L2 = true) + { + for (auto _ : state) { + if (flush_L2) { flush_L2_cache(); } + cuda_event_timer timer(state, stream); + benchmark_func(); + } + } +}; + +/** Indicates the dataset size. 
*/ +struct DatasetParams { + size_t rows; + size_t cols; + bool row_major; +}; + +/** Holds params needed to generate blobs dataset */ +struct BlobsParams { + int n_clusters; + double cluster_std; + bool shuffle; + double center_box_min, center_box_max; + uint64_t seed; +}; + +/** Fixture for cluster benchmarks using make_blobs */ +template +class BlobsFixture : public fixture { + public: + BlobsFixture(const DatasetParams dp, const BlobsParams bp) + : data_params(dp), blobs_params(bp), X(this->handle) + { + } + + virtual void run_benchmark(::benchmark::State& state) = 0; + + void allocate_data(const ::benchmark::State& state) override + { + auto labels_ref = raft::make_device_vector(this->handle, data_params.rows); + X = raft::make_device_matrix(this->handle, data_params.rows, data_params.cols); + + raft::random::make_blobs(X.data_handle(), + labels_ref.data_handle(), + (IndexT)data_params.rows, + (IndexT)data_params.cols, + (IndexT)blobs_params.n_clusters, + stream, + data_params.row_major, + nullptr, + nullptr, + (T)blobs_params.cluster_std, + blobs_params.shuffle, + (T)blobs_params.center_box_min, + (T)blobs_params.center_box_max, + blobs_params.seed); + resource::sync_stream(this->handle, stream); + } + + protected: + DatasetParams data_params; + BlobsParams blobs_params; + raft::device_matrix X; +}; + +namespace internal { + +template +class Fixture : public ::benchmark::Fixture { + using State = ::benchmark::State; + + public: + explicit Fixture(const std::string name, const Params&... params) + : ::benchmark::Fixture(), params_(params...), name_(name) + { + SetName(name_.c_str()); + } + Fixture() = delete; + + void SetUp(const State& state) override + { + fixture_ = + std::apply([](const Params&... 
ps) { return std::make_unique(ps...); }, params_); + fixture_->allocate_data(state); + fixture_->allocate_temp_buffers(state); + } + + void TearDown(const State& state) override + { + fixture_->deallocate_temp_buffers(state); + fixture_->deallocate_data(state); + fixture_.reset(); + } + + void SetUp(State& st) override { SetUp(const_cast(st)); } + void TearDown(State& st) override { TearDown(const_cast(st)); } + + private: + std::unique_ptr fixture_; + std::tuple params_; + const std::string name_; + + protected: + void BenchmarkCase(State& state) override + { + fixture_->run_benchmark(state); + fixture_->generate_metrics(state); + } +}; // class Fixture + +/** + * A helper struct to create a fixture for every combination of input vectors. + * Use with care, this can blow up quickly! + */ +template +struct cartesian_registrar { + template + static void run(const std::string case_name, + const std::vector&... params, + const Fixed&... fixed); +}; + +template +struct cartesian_registrar { + template + static void run(const std::string case_name, const Fixed&... fixed) + { + auto* b = ::benchmark::internal::RegisterBenchmarkInternal( + new Fixture(case_name, fixed...)); + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } +}; + +template +struct cartesian_registrar { + template + static void run(const std::string case_name, + const std::vector& param, + const std::vector&... params, + const Fixed&... fixed) + { + int param_len = param.size(); + for (int i = 0; i < param_len; i++) { + cartesian_registrar::run( + case_name + "/" + std::to_string(i), params..., fixed..., param[i]); + } + } +}; + +template +struct registrar { + /** + * Register a fixture `Class` named `testClass` for every combination of input `params`. + * + * @param test_class + * A string representation of the `Class` name. + * @param test_name + * Optional test name. Leave empty, if you don't need it. + * @param params + * Zero or more vectors of parameters. 
+ * The generated test cases are a cartesian product of these vectors. + * Use with care, this can blow up quickly! + */ + template + registrar(const std::string& test_class, + const std::string& test_name, + const std::vector&... params) + { + std::stringstream name_stream; + name_stream << test_class; + if (!test_name.empty()) { name_stream << "/" << test_name; } + cartesian_registrar::run(name_stream.str(), params...); + } +}; + +}; // namespace internal + +#define RAFT_BENCH_REGISTER_INTERNAL(TestClass, ...) \ + static raft::bench::internal::registrar BENCHMARK_PRIVATE_NAME(registrar)( \ + RAFT_STRINGIFY(TestClass), __VA_ARGS__) + +/** + * This is the entry point macro for all benchmarks. This needs to be called + * for the set of benchmarks to be registered so that the main harness inside + * google bench can find these benchmarks and run them. + * + * @param TestClass child class of `raft::bench::Fixture` which contains + * the logic to generate the dataset and run training on it + * for a given algo. Ideally, once such struct is needed for + * every algo to be benchmarked + * @param test_name a unique string to identify these tests at the end of run + * This is optional and if choose not to use this, pass an + * empty string + * @param params... zero or more lists of params upon which to benchmark. + */ +#define RAFT_BENCH_REGISTER(TestClass, ...) \ + RAFT_BENCH_REGISTER_INTERNAL(RAFT_DEPAREN(TestClass), __VA_ARGS__) + +} // namespace raft::bench diff --git a/cpp/bench/prims/core/bitset.cu b/cpp/bench/prims/core/bitset.cu new file mode 100644 index 000000000..ce3136bcd --- /dev/null +++ b/cpp/bench/prims/core/bitset.cu @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +namespace raft::bench::core { + +struct bitset_inputs { + uint32_t bitset_len; + uint32_t mask_len; + uint32_t query_len; +}; // struct bitset_inputs + +template +struct bitset_bench : public fixture { + bitset_bench(const bitset_inputs& p) + : params(p), + mask{raft::make_device_vector(res, p.mask_len)}, + queries{raft::make_device_vector(res, p.query_len)}, + outputs{raft::make_device_vector(res, p.query_len)} + { + raft::random::RngState state{42}; + raft::random::uniformInt(res, state, mask.view(), index_t{0}, index_t{p.bitset_len}); + } + + void run_benchmark(::benchmark::State& state) override + { + loop_on_state(state, [this]() { + auto my_bitset = raft::core::bitset( + this->res, raft::make_const_mdspan(mask.view()), params.bitset_len); + my_bitset.test(this->res, raft::make_const_mdspan(queries.view()), outputs.view()); + }); + } + + private: + raft::resources res; + bitset_inputs params; + raft::device_vector mask, queries; + raft::device_vector outputs; +}; // struct bitset + +const std::vector bitset_input_vecs{ + {256 * 1024 * 1024, 64 * 1024 * 1024, 256 * 1024 * 1024}, // Standard Bench + {256 * 1024 * 1024, 64 * 1024 * 1024, 1024 * 1024 * 1024}, // Extra queries + {128 * 1024 * 1024, 1024 * 1024 * 1024, 256 * 1024 * 1024}, // Extra mask to test atomics impact +}; + +using Uint8_32 = bitset_bench; +using Uint16_64 = bitset_bench; +using Uint32_32 = bitset_bench; +using Uint32_64 = bitset_bench; + +RAFT_BENCH_REGISTER(Uint8_32, "", bitset_input_vecs); +RAFT_BENCH_REGISTER(Uint16_64, "", 
bitset_input_vecs); +RAFT_BENCH_REGISTER(Uint32_32, "", bitset_input_vecs); +RAFT_BENCH_REGISTER(Uint32_64, "", bitset_input_vecs); + +} // namespace raft::bench::core diff --git a/cpp/bench/prims/core/copy.cu b/cpp/bench/prims/core/copy.cu new file mode 100644 index 000000000..31ee83b92 --- /dev/null +++ b/cpp/bench/prims/core/copy.cu @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::core { + +template +auto constexpr const default_dims = []() { + auto dims = std::array{}; + std::fill(dims.begin(), dims.end(), 2); + return dims; +}(); + +template +auto constexpr const default_dims = std::array{3000000}; + +template +auto constexpr const default_dims = std::array{1000, 3000}; + +template +auto constexpr const default_dims = std::array{20, 300, 500}; + +template > +struct bench_array_type; + +template +struct bench_array_type> { + template + auto static constexpr const extent_type = raft::dynamic_extent; + + using type = + std::conditional_t...>, LayoutPolicy>, + device_mdarray...>, LayoutPolicy>>; +}; + +template +struct params { + std::array dims = default_dims; + using src_array_type = + typename bench_array_type::type; + using dst_array_type = + typename bench_array_type::type; +}; + +template +struct CopyBench : public fixture { + using params_type = + params; 
+ using src_array_type = typename params_type::src_array_type; + using dst_array_type = typename params_type::dst_array_type; + explicit CopyBench(const params_type& ps) + : fixture{true}, + res_{}, + params_{ps}, + src_{ + res_, + typename src_array_type::mapping_type{ + std::apply([](auto... exts) { return make_extents(exts...); }, ps.dims)}, + typename src_array_type::container_policy_type{}, + }, + dst_{ + res_, + typename dst_array_type::mapping_type{ + std::apply([](auto... exts) { return make_extents(exts...); }, ps.dims)}, + typename dst_array_type::container_policy_type{}, + } + { + res_.get_cublas_handle(); // initialize cublas handle + auto src_data = std::vector(src_.size()); + std::iota(src_data.begin(), src_data.end(), SrcT{}); + raft::copy(src_.data_handle(), src_data.data(), src_.size(), res_.get_stream()); + } + + void run_benchmark(::benchmark::State& state) override + { + loop_on_state(state, [this]() { raft::copy(res_, dst_.view(), src_.view()); }); + } + + private: + raft::device_resources res_; + params_type params_; + src_array_type src_; + dst_array_type dst_; +}; + +template +auto static const inputs = std::vector{ParamsT{}}; + +#define COPY_REGISTER(BenchT) \ + RAFT_BENCH_REGISTER(BenchT, "BenchT", inputs) + +using copy_bench_device_device_1d_same_dtype_same_layout = CopyBench; +using copy_bench_device_device_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_device_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_device_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_device_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_device_device_3d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_device_3d_diff_dtype_same_layout = CopyBench; + +using copy_bench_host_host_1d_same_dtype_same_layout = CopyBench; +using copy_bench_host_host_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_host_1d_diff_dtype_diff_layout = CopyBench; +using 
copy_bench_host_host_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_host_2d_same_dtype_diff_layout_float_float = CopyBench; +using copy_bench_host_host_3d_diff_dtype_same_layout = CopyBench; +using copy_bench_host_host_3d_diff_dtype_diff_layout = CopyBench; + +using copy_bench_device_host_1d_same_dtype_same_layout = CopyBench; +using copy_bench_device_host_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_host_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_host_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_host_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_device_host_3d_diff_dtype_same_layout = CopyBench; +using copy_bench_device_host_3d_diff_dtype_diff_layout = CopyBench; + +using copy_bench_host_device_1d_same_dtype_same_layout = CopyBench; +using copy_bench_host_device_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_device_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_device_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_device_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_host_device_3d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_device_3d_diff_dtype_same_layout = CopyBench; + +// COPY_REGISTER(copy_bench_same_dtype_1d_host_host); +COPY_REGISTER(copy_bench_device_device_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_device_device_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_device_device_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_device_device_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_host_host_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_host_host_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_1d_diff_dtype_diff_layout); 
+COPY_REGISTER(copy_bench_host_host_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_2d_same_dtype_diff_layout_float_float); +COPY_REGISTER(copy_bench_host_host_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_host_host_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_device_host_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_device_host_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_device_host_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_device_host_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_host_device_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_host_device_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_host_device_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_host_device_3d_diff_dtype_diff_layout); + +} // namespace raft::bench::core diff --git a/cpp/bench/prims/distance/distance_common.cuh b/cpp/bench/prims/distance/distance_common.cuh new file mode 100644 index 000000000..dff3401b6 --- /dev/null +++ b/cpp/bench/prims/distance/distance_common.cuh @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +namespace raft::bench::distance { + +struct distance_params { + int m, n, k; + bool isRowMajor; +}; // struct distance_params + +template +struct distance : public fixture { + distance(const distance_params& p) + : params(p), + x(p.m * p.k, stream), + y(p.n * p.k, stream), + out(p.m * p.n, stream), + workspace(0, stream) + { + RAFT_CUDA_TRY(cudaMemsetAsync(x.data(), 0, x.size() * sizeof(T), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(y.data(), 0, y.size() * sizeof(T), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(out.data(), 0, out.size() * sizeof(T), stream)); + worksize = raft::distance::getWorkspaceSize( + x.data(), y.data(), params.m, params.n, params.k); + workspace.resize(worksize, stream); + } + + void run_benchmark(::benchmark::State& state) override + { + loop_on_state(state, [this]() { + raft::distance::distance(handle, + x.data(), + y.data(), + out.data(), + params.m, + params.n, + params.k, + (void*)workspace.data(), + worksize, + params.isRowMajor); + }); + } + + private: + distance_params params; + rmm::device_uvector x, y, out; + rmm::device_uvector workspace; + size_t worksize; +}; // struct Distance + +const std::vector dist_input_vecs{ + {32, 16384, 16384, true}, {64, 16384, 16384, true}, {128, 16384, 16384, true}, + {256, 16384, 16384, true}, {512, 16384, 16384, true}, {1024, 16384, 16384, true}, + {16384, 32, 16384, true}, {16384, 64, 16384, true}, {16384, 128, 16384, true}, + {16384, 256, 16384, true}, {16384, 512, 16384, true}, {16384, 1024, 16384, true}, + {16384, 16384, 32, true}, {16384, 16384, 64, true}, {16384, 16384, 128, true}, + {16384, 16384, 256, true}, {16384, 16384, 512, true}, {16384, 16384, 1024, true}, + {16384, 16384, 16384, true}, {32, 16384, 16384, false}, {64, 16384, 16384, false}, + {128, 16384, 16384, false}, {256, 16384, 16384, false}, {512, 16384, 16384, false}, + {1024, 
16384, 16384, false}, {16384, 32, 16384, false}, {16384, 64, 16384, false}, + {16384, 128, 16384, false}, {16384, 256, 16384, false}, {16384, 512, 16384, false}, + {16384, 1024, 16384, false}, {16384, 16384, 32, false}, {16384, 16384, 64, false}, + {16384, 16384, 128, false}, {16384, 16384, 256, false}, {16384, 16384, 512, false}, + {16384, 16384, 1024, false}, {16384, 16384, 16384, false} + +}; + +#define DIST_BENCH_REGISTER(Name, Metric) \ + using Name##F = distance; \ + RAFT_BENCH_REGISTER(Name##F, "", dist_input_vecs); \ + using Name##D = distance; \ + RAFT_BENCH_REGISTER(Name##D, "", dist_input_vecs); + +} // namespace raft::bench::distance diff --git a/cpp/bench/prims/distance/distance_cosine.cu b/cpp/bench/prims/distance/distance_cosine.cu new file mode 100644 index 000000000..c8ac8067c --- /dev/null +++ b/cpp/bench/prims/distance/distance_cosine.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "distance_common.cuh" + +namespace raft::bench::distance { + +DIST_BENCH_REGISTER(DistanceCosine, raft::distance::DistanceType::CosineExpanded); + +} // namespace raft::bench::distance diff --git a/cpp/bench/prims/distance/distance_exp_l2.cu b/cpp/bench/prims/distance/distance_exp_l2.cu new file mode 100644 index 000000000..52b7fff05 --- /dev/null +++ b/cpp/bench/prims/distance/distance_exp_l2.cu @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "distance_common.cuh" + +namespace raft::bench::distance { + +DIST_BENCH_REGISTER(DistanceL2Sq, raft::distance::DistanceType::L2Expanded); +DIST_BENCH_REGISTER(DistanceL2Sqrt, raft::distance::DistanceType::L2SqrtExpanded); + +} // namespace raft::bench::distance diff --git a/cpp/bench/prims/distance/distance_l1.cu b/cpp/bench/prims/distance/distance_l1.cu new file mode 100644 index 000000000..e80db48ef --- /dev/null +++ b/cpp/bench/prims/distance/distance_l1.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "distance_common.cuh" + +namespace raft::bench::distance { + +DIST_BENCH_REGISTER(DistanceL1, raft::distance::DistanceType::L1); + +} // namespace raft::bench::distance diff --git a/cpp/bench/prims/distance/distance_unexp_l2.cu b/cpp/bench/prims/distance/distance_unexp_l2.cu new file mode 100644 index 000000000..7ac1a8a4b --- /dev/null +++ b/cpp/bench/prims/distance/distance_unexp_l2.cu @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "distance_common.cuh" + +namespace raft::bench::distance { + +DIST_BENCH_REGISTER(DistanceUnexpL2Sq, raft::distance::DistanceType::L2Unexpanded); +DIST_BENCH_REGISTER(DistanceUnexpL2Sqrt, raft::distance::DistanceType::L2SqrtUnexpanded); + +} // namespace raft::bench::distance diff --git a/cpp/bench/prims/distance/fused_l2_nn.cu b/cpp/bench/prims/distance/fused_l2_nn.cu new file mode 100644 index 000000000..c0ebd6045 --- /dev/null +++ b/cpp/bench/prims/distance/fused_l2_nn.cu @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +namespace raft::bench::distance { + +struct fusedl2nn_inputs { + int64_t m, n, k; +}; // struct fusedl2nn_inputs + +inline auto operator<<(std::ostream& os, const fusedl2nn_inputs& p) -> std::ostream& +{ + os << p.m << "#" << p.n << "#" << p.k; + return os; +} + +template +struct fusedl2nn : public fixture { + fusedl2nn(const fusedl2nn_inputs& p) + : params(p), + workspace(this->handle), + x(this->handle), + y(this->handle), + x_norm(this->handle), + y_norm(this->handle), + out(this->handle) + { + } + + void allocate_data(const ::benchmark::State& state) override + { + x = raft::make_device_matrix(handle, params.m, params.k); + y = raft::make_device_matrix(handle, params.n, params.k); + x_norm = raft::make_device_vector(handle, params.m); + y_norm = raft::make_device_vector(handle, params.n); + out = raft::make_device_vector(handle, params.m); + + raft::random::RngState rng{1234}; + raft::random::uniform( + handle, rng, x.data_handle(), params.m * params.k, (DataT)-1.0, (DataT)1.0); + raft::random::uniform( + handle, rng, y.data_handle(), params.n * params.k, (DataT)-1.0, (DataT)1.0); + + // Pre-compute norms + raft::linalg::rowNorm(x_norm.data_handle(), + x.data_handle(), + params.k, + params.m, + raft::linalg::L2Norm, + true, + stream); + raft::linalg::rowNorm(y_norm.data_handle(), + y.data_handle(), + params.k, + params.n, + raft::linalg::L2Norm, + true, + stream); + resource::sync_stream(handle, stream); + } + + void allocate_temp_buffers(const ::benchmark::State& state) 
override + { + workspace = raft::make_device_vector(handle, params.m * sizeof(IdxT)); + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + loop_on_state(state, [this]() { + raft::distance::fusedL2NNMinReduce(out.data_handle(), + x.data_handle(), + y.data_handle(), + x_norm.data_handle(), + y_norm.data_handle(), + static_cast(params.m), + static_cast(params.n), + static_cast(params.k), + (void*)workspace.data_handle(), + false, + true, + stream); + }); + + int64_t num_flops = 2 * params.m * params.n * params.k; + + int64_t read_elts = params.n * params.k + params.m * params.k; + int64_t write_elts = params.m; + + state.counters["FLOP/s"] = benchmark::Counter( + num_flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1000); + + state.counters["BW Wr"] = benchmark::Counter(write_elts * sizeof(OutT), + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + state.counters["BW Rd"] = benchmark::Counter(read_elts * sizeof(DataT), + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + } + + private: + fusedl2nn_inputs params; + raft::device_matrix x, y; + raft::device_vector x_norm, y_norm; + raft::device_vector out; + raft::device_vector workspace; +}; // struct fusedl2nn + +template +std::vector getFusedL2NNInputs() +{ + std::vector inputs; + std::vector m_list = {100000, 1000000}; + if constexpr (sizeof(IdxT) == 8) { m_list.push_back(10000000); } + std::vector n_list = {100, 1000, 10000}; + std::vector k_list = {64, 128, 256}; + for (auto m : m_list) { + for (auto n : n_list) { + for (auto k : k_list) { + inputs.push_back({m, n, k}); + } + } + } + return inputs; +} + +#define FUSEDL2NN_BENCH(DataT, IdxT, OutT) \ + RAFT_BENCH_REGISTER((fusedl2nn), "", getFusedL2NNInputs()) + +FUSEDL2NN_BENCH(float, int, float); +FUSEDL2NN_BENCH(double, int, double); 
+FUSEDL2NN_BENCH(float, int, (raft::KeyValuePair)); +FUSEDL2NN_BENCH(double, int, (raft::KeyValuePair)); +FUSEDL2NN_BENCH(float, int64_t, float); +FUSEDL2NN_BENCH(double, int64_t, double); +FUSEDL2NN_BENCH(float, int64_t, (raft::KeyValuePair)); +FUSEDL2NN_BENCH(double, int64_t, (raft::KeyValuePair)); + +} // namespace raft::bench::distance diff --git a/cpp/bench/prims/distance/kernels.cu b/cpp/bench/prims/distance/kernels.cu new file mode 100644 index 000000000..3f7475966 --- /dev/null +++ b/cpp/bench/prims/distance/kernels.cu @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::distance::kernels { + +using namespace raft::distance::kernels; +struct GramTestParams { + int m; // m parameter of the GEMM + int k; // k parameter of the GEMM + int n; // n parameter of the GEMM + KernelParams kernel_params; + bool is_row_major; +}; // struct GramTestParams + +template +struct GramMatrix : public fixture { + GramMatrix(const GramTestParams& p) + : params(p), handle(stream), A(0, stream), B(0, stream), C(0, stream) + { + kernel = std::unique_ptr>( + KernelFactory::create(p.kernel_params, resource::get_cublas_handle(handle))); + + A.resize(params.m * params.k, stream); + B.resize(params.k * params.n, stream); + C.resize(params.m * params.n, stream); + raft::random::RngState rng(123456ULL); + raft::random::uniform(handle, rng, A.data(), params.m * params.k, T(-1.0), T(1.0)); + raft::random::uniform(handle, rng, B.data(), params.k * params.n, T(-1.0), T(1.0)); + } + + ~GramMatrix() + { + A.release(); + B.release(); + C.release(); + } + + void run_benchmark(::benchmark::State& state) override + { + if (!this->kernel) { state.SkipWithError("Kernel matrix is not initialized"); } + loop_on_state(state, [this]() { + (*this->kernel)(A.data(), + this->params.m, + this->params.k, + B.data(), + this->params.n, + C.data(), + this->params.is_row_major, + this->stream); + }); + } + + private: + const raft::device_resources handle; + std::unique_ptr> kernel; + GramTestParams params; + + rmm::device_uvector A; // input matrix A, size [m * k] + rmm::device_uvector B; // input matrix B, size [n * k] + rmm::device_uvector C; // output matrix C, size [m*n] +}; + +static std::vector getInputs() +{ + std::vector param_vec; + std::vector kernel_params{KernelParams{LINEAR, 3, 1, 0}, + KernelParams{POLYNOMIAL, 2, 1.3, 1}, + KernelParams{TANH, 2, 0.5, 2.4}, + KernelParams{RBF, 2, 0.5, 0}}; + struct TestSize { + int m; + int k; + int n; + }; + 
std::vector data_size{{4096, 10, 1024}, + {4096, 100, 1024}, + {4096, 1000, 1024}, + {4096, 10000, 1024}, + {100000, 10, 1024}, + {100000, 100, 1024}, + {100000, 1000, 1024}}; + + param_vec.reserve(kernel_params.size() * data_size.size()); + for (TestSize s : data_size) { + for (auto kernel : kernel_params) { + for (bool row_major : {false, true}) { + param_vec.push_back(GramTestParams{s.m, s.k, s.n, kernel, row_major}); + } + } + } + return param_vec; +} + +RAFT_BENCH_REGISTER(GramMatrix, "", getInputs()); +RAFT_BENCH_REGISTER(GramMatrix, "", getInputs()); + +} // namespace raft::bench::distance::kernels diff --git a/cpp/bench/prims/distance/masked_nn.cu b/cpp/bench/prims/distance/masked_nn.cu new file mode 100644 index 000000000..19d78f4cd --- /dev/null +++ b/cpp/bench/prims/distance/masked_nn.cu @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::distance::masked_nn { + +// Introduce various sparsity patterns +enum AdjacencyPattern { + checkerboard = 0, + checkerboard_4 = 1, + checkerboard_64 = 2, + all_true = 3, + all_false = 4 +}; + +struct Params { + int m, n, k, num_groups; + AdjacencyPattern pattern; +}; // struct Params + +RAFT_KERNEL init_adj(AdjacencyPattern pattern, + int n, + raft::device_matrix_view adj, + raft::device_vector_view group_idxs) +{ + int m = adj.extent(0); + int num_groups = adj.extent(1); + + for (int idx_m = blockIdx.y * blockDim.y + threadIdx.y; idx_m < m; + idx_m += blockDim.y * gridDim.y) { + for (int idx_g = blockIdx.x * blockDim.x + threadIdx.x; idx_g < num_groups; + idx_g += blockDim.x * gridDim.x) { + switch (pattern) { + case checkerboard: adj(idx_m, idx_g) = (idx_m + idx_g) % 2; break; + case checkerboard_4: adj(idx_m, idx_g) = (idx_m / 4 + idx_g) % 2; break; + case checkerboard_64: adj(idx_m, idx_g) = (idx_m / 64 + idx_g) % 2; break; + case all_true: adj(idx_m, idx_g) = true; break; + case all_false: adj(idx_m, idx_g) = false; break; + default: assert(false && "unknown pattern"); + } + } + } + // Each group is of size n / num_groups. + // + // - group_idxs[j] indicates the start of group j + 1 (i.e. is the inclusive + // scan of the group lengths) + // + // - The first group always starts at index zero, so we do not store it. + // + // - The group_idxs[num_groups - 1] should always equal n. 
+ + if (blockIdx.y == 0 && threadIdx.y == 0) { + const int g_stride = blockDim.x * gridDim.x; + for (int idx_g = blockIdx.x * blockDim.x + threadIdx.x; idx_g < num_groups; idx_g += g_stride) { + group_idxs(idx_g) = (idx_g + 1) * (n / num_groups); + } + group_idxs(num_groups - 1) = n; + } +} + +template +struct masked_l2_nn : public fixture { + using DataT = T; + using IdxT = int; + using OutT = raft::KeyValuePair; + using RedOpT = raft::distance::MinAndDistanceReduceOp; + using PairRedOpT = raft::distance::KVPMinReduce; + using ParamT = raft::distance::masked_l2_nn_params; + + // Parameters + Params params; + // Data + raft::device_vector out; + raft::device_matrix x, y; + raft::device_vector xn, yn; + raft::device_matrix adj; + raft::device_vector group_idxs; + + masked_l2_nn(const Params& p) + : params(p), + out{raft::make_device_vector(handle, p.m)}, + x{raft::make_device_matrix(handle, p.m, p.k)}, + y{raft::make_device_matrix(handle, p.n, p.k)}, + xn{raft::make_device_vector(handle, p.m)}, + yn{raft::make_device_vector(handle, p.n)}, + adj{raft::make_device_matrix(handle, p.m, p.num_groups)}, + group_idxs{raft::make_device_vector(handle, p.num_groups)} + { + raft::random::RngState r(123456ULL); + + uniform(handle, r, x.data_handle(), p.m * p.k, T(-1.0), T(1.0)); + uniform(handle, r, y.data_handle(), p.n * p.k, T(-1.0), T(1.0)); + raft::linalg::rowNorm( + xn.data_handle(), x.data_handle(), p.k, p.m, raft::linalg::L2Norm, true, stream); + raft::linalg::rowNorm( + yn.data_handle(), y.data_handle(), p.k, p.n, raft::linalg::L2Norm, true, stream); + raft::distance::initialize, int>( + handle, out.data_handle(), p.m, std::numeric_limits::max(), RedOpT{}); + + dim3 block(32, 32); + dim3 grid(10, 10); + init_adj<<>>(p.pattern, p.n, adj.view(), group_idxs.view()); + RAFT_CUDA_TRY(cudaGetLastError()); + } + + void run_benchmark(::benchmark::State& state) override + { + bool init_out = true; + bool sqrt = false; + ParamT masked_l2_params{RedOpT{}, PairRedOpT{}, sqrt, 
init_out}; + + loop_on_state(state, [this, masked_l2_params]() { + // It is sufficient to only benchmark the L2-squared metric + raft::distance::masked_l2_nn(handle, + masked_l2_params, + x.view(), + y.view(), + xn.view(), + yn.view(), + adj.view(), + group_idxs.view(), + out.view()); + }); + + // Virtual flop count if no skipping had occurred. + size_t virtual_flops = size_t(2) * size_t(params.m) * size_t(params.n) * size_t(params.k); + + int64_t read_elts = params.n * params.k + params.m * params.k; + int64_t write_elts = params.m; + + // Virtual min flops is the number of flops that would have been executed if + // the algorithm had actually skipped each computation that it could have + // skipped. + size_t virtual_min_flops = 0; + switch (params.pattern) { + case checkerboard: + case checkerboard_4: + case checkerboard_64: virtual_min_flops = virtual_flops / 2; break; + case all_true: virtual_min_flops = virtual_flops; break; + case all_false: virtual_min_flops = 0; break; + default: assert(false && "unknown pattern"); + } + + // VFLOP/s is the "virtual" flop count that would have executed if there was + // no adjacency pattern. This is useful for comparing to fusedL2NN + state.counters["VFLOP/s"] = benchmark::Counter(virtual_flops, + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + // Virtual min flops is the number of flops that would have been executed if + // the algorithm had actually skipped each computation that it could have + // skipped. 
+ state.counters["VminFLOP/s"] = benchmark::Counter(virtual_min_flops, + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + + state.counters["BW Wr"] = benchmark::Counter(write_elts * sizeof(OutT), + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + state.counters["BW Rd"] = benchmark::Counter(read_elts * sizeof(DataT), + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + + state.counters["m"] = benchmark::Counter(params.m); + state.counters["n"] = benchmark::Counter(params.n); + state.counters["k"] = benchmark::Counter(params.k); + state.counters["num_groups"] = benchmark::Counter(params.num_groups); + state.counters["group size"] = benchmark::Counter(params.n / params.num_groups); + state.counters["Pat"] = benchmark::Counter(static_cast(params.pattern)); + + state.counters["SM count"] = raft::getMultiProcessorCount(); + } +}; + +const std::vector masked_l2_nn_input_vecs = { + // Very fat matrices... + {32, 16384, 16384, 32, AdjacencyPattern::checkerboard}, + {64, 16384, 16384, 32, AdjacencyPattern::checkerboard}, + {128, 16384, 16384, 32, AdjacencyPattern::checkerboard}, + {256, 16384, 16384, 32, AdjacencyPattern::checkerboard}, + {512, 16384, 16384, 32, AdjacencyPattern::checkerboard}, + {1024, 16384, 16384, 32, AdjacencyPattern::checkerboard}, + {16384, 32, 16384, 32, AdjacencyPattern::checkerboard}, + {16384, 64, 16384, 32, AdjacencyPattern::checkerboard}, + {16384, 128, 16384, 32, AdjacencyPattern::checkerboard}, + {16384, 256, 16384, 32, AdjacencyPattern::checkerboard}, + {16384, 512, 16384, 32, AdjacencyPattern::checkerboard}, + {16384, 1024, 16384, 32, AdjacencyPattern::checkerboard}, + + // Representative matrices... 
+ {16384, 16384, 32, 32, AdjacencyPattern::checkerboard}, + {16384, 16384, 64, 32, AdjacencyPattern::checkerboard}, + {16384, 16384, 128, 32, AdjacencyPattern::checkerboard}, + {16384, 16384, 256, 32, AdjacencyPattern::checkerboard}, + {16384, 16384, 512, 32, AdjacencyPattern::checkerboard}, + {16384, 16384, 1024, 32, AdjacencyPattern::checkerboard}, + {16384, 16384, 16384, 32, AdjacencyPattern::checkerboard}, + + {16384, 16384, 32, 32, AdjacencyPattern::checkerboard_4}, + {16384, 16384, 64, 32, AdjacencyPattern::checkerboard_4}, + {16384, 16384, 128, 32, AdjacencyPattern::checkerboard_4}, + {16384, 16384, 256, 32, AdjacencyPattern::checkerboard_4}, + {16384, 16384, 512, 32, AdjacencyPattern::checkerboard_4}, + {16384, 16384, 1024, 32, AdjacencyPattern::checkerboard_4}, + {16384, 16384, 16384, 32, AdjacencyPattern::checkerboard_4}, + + {16384, 16384, 32, 32, AdjacencyPattern::checkerboard_64}, + {16384, 16384, 64, 32, AdjacencyPattern::checkerboard_64}, + {16384, 16384, 128, 32, AdjacencyPattern::checkerboard_64}, + {16384, 16384, 256, 32, AdjacencyPattern::checkerboard_64}, + {16384, 16384, 512, 32, AdjacencyPattern::checkerboard_64}, + {16384, 16384, 1024, 32, AdjacencyPattern::checkerboard_64}, + {16384, 16384, 16384, 32, AdjacencyPattern::checkerboard_64}, + + {16384, 16384, 32, 32, AdjacencyPattern::all_true}, + {16384, 16384, 64, 32, AdjacencyPattern::all_true}, + {16384, 16384, 128, 32, AdjacencyPattern::all_true}, + {16384, 16384, 256, 32, AdjacencyPattern::all_true}, + {16384, 16384, 512, 32, AdjacencyPattern::all_true}, + {16384, 16384, 1024, 32, AdjacencyPattern::all_true}, + {16384, 16384, 16384, 32, AdjacencyPattern::all_true}, + + {16384, 16384, 32, 32, AdjacencyPattern::all_false}, + {16384, 16384, 64, 32, AdjacencyPattern::all_false}, + {16384, 16384, 128, 32, AdjacencyPattern::all_false}, + {16384, 16384, 256, 32, AdjacencyPattern::all_false}, + {16384, 16384, 512, 32, AdjacencyPattern::all_false}, + {16384, 16384, 1024, 32, 
AdjacencyPattern::all_false}, + {16384, 16384, 16384, 32, AdjacencyPattern::all_false}, +}; + +RAFT_BENCH_REGISTER(masked_l2_nn, "", masked_l2_nn_input_vecs); +// We don't benchmark double to keep compile times in check when not using the +// distance library. + +} // namespace raft::bench::distance::masked_nn diff --git a/cpp/bench/prims/distance/tune_pairwise/bench.cu b/cpp/bench/prims/distance/tune_pairwise/bench.cu new file mode 100644 index 000000000..87159ab1b --- /dev/null +++ b/cpp/bench/prims/distance/tune_pairwise/bench.cu @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Tuning benchmarks. +// +// Goals: +// +// 1. Fast compile times to maintain iteration speed. +// 2. Create benchmarks that can inform the design of the kernels. +// +// Non-goals: +// +// 1. Measure every distance operation. Instead measures just one distance +// operation at the same time. +// 2. Be useful for finding performance regressions. This is handled by the +// normal benchmarks. +// +// So far, both goals are partly achieved. +// +// RE (1), COMPILE TIMES: kernel.cu is fast to compile. This file is not. +// When the internals of a pairwise distance kernel is changed, this file is not +// recompiled. +// +// RE 2, benchmarks with intent: this file contains a benchmark to check the +// maximal throughput of a kernel. 
Measuring other things, like performance on +// skinny or wide matrices is not yet implemented. + +#include "kernel.cuh" // launch_kernel +#include // std::min +#include // RAFT_BENCH_REGISTER +#include // pairwise_matrix_params +#include // rmm::device_uvector +#include // std::vector + +namespace raft::bench::distance::tune { + +// Max throughput benchmark. +// +// Goal: Measure the maximum distances/sec that can be computed. +// +// To achieve this, we make sure that: +// +// - Input data size is a multiple of the block tile size. +// +// - Perfect distribution of work between SMs, i.e. the number of block tiles is +// a large multiple (num_waves) of the number of blocks (#SMs * occupancy). +// +// - Multiple iterations over Kblk are executed (num_k_iters). +struct throughput_param { + int num_waves; + int occupancy; + int num_k_iters; +}; + +const std::vector throughput_params{ + // 32 waves, requested occupancy of 4, and 32 k iterations typically achieves + // maximum throughput. No need to pick higher values. + {32, 4, 32}, +}; + +struct throughput_bench : public fixture { + const throughput_param p; + + throughput_bench(const throughput_param& p_) : p(p_) {} + + void run_benchmark(::benchmark::State& state) override + { + // Get block size: + int block_m, block_n, block_k; + get_block_size(block_m, block_n, block_k); + + // Determine number of blocks that will be launched. This informs the size + // of the inputs as well as the grid size. + const int num_sms = raft::getMultiProcessorCount(); + const int max_occupancy = get_max_occupancy(); + const int occupancy = std::min(p.occupancy, max_occupancy); + const int num_blocks = occupancy * num_sms; + dim3 grid(num_blocks); + + // Create input sizes that are a multiple of the block tile size. 
+ size_t m = block_m; + size_t n = block_n * p.num_waves * num_blocks; + size_t k = block_k * p.num_k_iters; + + // DataT, OutT, IdxT, etc, are defined in tuned_kernel.cuh + rmm::device_uvector x_vec(m * k, stream); + rmm::device_uvector y_vec(n * k, stream); + rmm::device_uvector x_norm_vec(m, stream); + rmm::device_uvector y_norm_vec(n, stream); + rmm::device_uvector out_vec(m * n, stream); + + auto x = x_vec.data(); + auto y = y_vec.data(); + auto x_norm = x_norm_vec.data(); + auto y_norm = y_norm_vec.data(); + auto out = out_vec.data(); + FinOpT fin_op{}; + + // Create kernel parameter struct. Flip x and y if column major. + IdxT ldx = row_major ? k : m; + IdxT ldy = row_major ? k : n; + IdxT ld_out = row_major ? n : m; + + // Template parameters of pairwise_matrix_params are defined in kernel.cuh + pairwise_matrix_params kparams{ + IdxT(m), IdxT(n), IdxT(k), ldx, ldy, ld_out, x, y, x_norm, y_norm, out, fin_op, row_major}; + + // Run benchmark + loop_on_state(state, [&]() { launch_kernel(kparams, grid, stream); }); + + // Report metrics. We don't report flop/s because we do not know for each + // distance operation how many flops it costs. For L2_unexp and l1, we can + // double this number to get the flop/s. For l2 expanded, core_ops/s should + // equal flop/s (modulo the sqrt and subtracting from the norm). 
+ size_t num_core_ops = m * n * k; + size_t read_elts = n * k + m * k; + size_t write_elts = m * n; + + state.counters["m"] = benchmark::Counter(m); + state.counters["n"] = benchmark::Counter(n); + state.counters["k"] = benchmark::Counter(k); + state.counters["occupancy"] = benchmark::Counter(occupancy); + state.counters["# waves"] = benchmark::Counter(p.num_waves); + state.counters["# k iters"] = benchmark::Counter(p.num_k_iters); + + state.counters["core_ops/s"] = benchmark::Counter(num_core_ops, + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + + state.counters["BW"] = benchmark::Counter(write_elts * sizeof(OutT) + read_elts * sizeof(DataT), + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + } +}; + +RAFT_BENCH_REGISTER(throughput_bench, "", throughput_params); + +} // namespace raft::bench::distance::tune diff --git a/cpp/bench/prims/distance/tune_pairwise/kernel.cu b/cpp/bench/prims/distance/tune_pairwise/kernel.cu new file mode 100644 index 000000000..3112e1ea9 --- /dev/null +++ b/cpp/bench/prims/distance/tune_pairwise/kernel.cu @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel.cuh" +#include // pairwise_matrix_sm60_wrapper +#include // raft::linalg::Policy4x4 +#include // raft::util::arch::SM_compute_arch + +namespace raft::bench::distance::tune { + +// Distance op +using OpT = raft::distance::detail::ops::lp_unexp_distance_op; +constexpr float metric_arg = 2.0; +OpT distance_op{metric_arg}; + +// Kernel policy +constexpr int vec_len = 1; +using Policy = typename raft::linalg::Policy4x4::Policy; + +// Architecture +namespace arch = raft::util::arch; +constexpr auto sm_compat_range = arch::SM_range(arch::SM_min(), arch::SM_future()); + +void launch_kernel(pairwise_matrix_params params, dim3 grid, cudaStream_t stream) +{ + dim3 block(Policy::Nthreads); + int smem_size = OpT::shared_mem_size(); + + // Obtain function pointer to kernel + auto kernel = raft::distance::detail::pairwise_matrix_kernel; + + kernel<<>>(distance_op, params); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +void get_block_size(int& m, int& n, int& k) +{ + m = Policy::Mblk; + n = Policy::Nblk; + k = Policy::Kblk; +} + +void* get_kernel_ptr() +{ + auto kernel = raft::distance::detail::pairwise_matrix_kernel; + return reinterpret_cast(kernel); +} + +int get_max_occupancy() +{ + void* kernel_ptr = get_kernel_ptr(); + int max_occupancy; + int smem_size = OpT::shared_mem_size(); + + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_occupancy, kernel_ptr, Policy::Nthreads, smem_size)); + + return max_occupancy; +} + +} // namespace raft::bench::distance::tune diff --git a/cpp/bench/prims/distance/tune_pairwise/kernel.cuh b/cpp/bench/prims/distance/tune_pairwise/kernel.cuh new file mode 100644 index 000000000..5da54a343 --- /dev/null +++ b/cpp/bench/prims/distance/tune_pairwise/kernel.cuh @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include // lp_unexp_distance_op +#include // pairwise_matrix_params + +namespace raft::bench::distance::tune { + +// Launch one specific kernel with the following template parameters +constexpr bool row_major = true; +using DataT = float; +using AccT = float; +using OutT = DataT; +using IdxT = int; + +using FinOpT = raft::identity_op; + +using pairwise_matrix_params = + raft::distance::detail::pairwise_matrix_params; + +// Launches kernel +void launch_kernel(pairwise_matrix_params, dim3, cudaStream_t); + +// Describes the block size that is decided by the policy +void get_block_size(int& m, int& n, int& k); + +int get_max_occupancy(); + +} // namespace raft::bench::distance::tune diff --git a/cpp/bench/prims/linalg/add.cu b/cpp/bench/prims/linalg/add.cu new file mode 100644 index 000000000..456214ad7 --- /dev/null +++ b/cpp/bench/prims/linalg/add.cu @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace raft::bench::linalg { + +struct add_inputs { + int len; +}; // struct add_inputs + +template +struct add : public fixture { + add(const add_inputs& p) : params(p), ptr0(p.len, stream), ptr1(p.len, stream) {} + + void run_benchmark(::benchmark::State& state) override + { + loop_on_state(state, [this]() { + raft::linalg::add(ptr0.data(), ptr0.data(), ptr1.data(), params.len, stream); + }); + } + + private: + add_inputs params; + rmm::device_uvector ptr0, ptr1; +}; // struct add + +const std::vector add_input_vecs{ + {256 * 1024 * 1024}, {256 * 1024 * 1024 + 2}, {256 * 1024 * 1024 + 1} + +}; + +RAFT_BENCH_REGISTER(add, "", add_input_vecs); +RAFT_BENCH_REGISTER(add, "", add_input_vecs); + +} // namespace raft::bench::linalg diff --git a/cpp/bench/prims/linalg/map_then_reduce.cu b/cpp/bench/prims/linalg/map_then_reduce.cu new file mode 100644 index 000000000..84aebd85b --- /dev/null +++ b/cpp/bench/prims/linalg/map_then_reduce.cu @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace raft::bench::linalg { + +struct map_then_reduce_inputs { + int len; +}; + +template +struct Identity { + HDI Type operator()(Type a) { return a; } +}; + +template +struct map_then_reduce : public fixture { + map_then_reduce(const map_then_reduce_inputs& p) : params(p), in(p.len, stream), out(1, stream) {} + + void run_benchmark(::benchmark::State& state) override + { + loop_on_state(state, [this]() { + raft::linalg::mapThenSumReduce(out.data(), params.len, Identity(), stream, in.data()); + }); + } + + private: + map_then_reduce_inputs params; + rmm::device_uvector out, in; +}; // struct MapThenReduce + +const std::vector map_then_reduce_input_vecs{ + {1024 * 1024}, + {32 * 1024 * 1024}, + {1024 * 1024 * 1024}, + {1024 * 1024 + 2}, + {32 * 1024 * 1024 + 2}, + {1024 * 1024 * 1024 + 2}, + {1024 * 1024 + 1}, + {32 * 1024 * 1024 + 1}, + {1024 * 1024 * 1024 + 1}, + +}; + +RAFT_BENCH_REGISTER(map_then_reduce, "", map_then_reduce_input_vecs); +RAFT_BENCH_REGISTER(map_then_reduce, "", map_then_reduce_input_vecs); + +} // namespace raft::bench::linalg diff --git a/cpp/bench/prims/linalg/matrix_vector_op.cu b/cpp/bench/prims/linalg/matrix_vector_op.cu new file mode 100644 index 000000000..d1fbaee79 --- /dev/null +++ b/cpp/bench/prims/linalg/matrix_vector_op.cu @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace raft::bench::linalg { + +template +struct mat_vec_op_inputs { + IdxT rows, cols; + bool rowMajor, bcastAlongRows; + IdxT inAlignOffset, outAlignOffset; +}; // struct mat_vec_op_inputs + +template +inline auto operator<<(std::ostream& os, const mat_vec_op_inputs& p) -> std::ostream& +{ + os << p.rows << "#" << p.cols << "#" << p.rowMajor << "#" << p.bcastAlongRows << "#" + << p.inAlignOffset << "#" << p.outAlignOffset; + return os; +} + +template +struct mat_vec_op : public fixture { + mat_vec_op(const mat_vec_op_inputs& p) + : params(p), + out(p.rows * p.cols + params.outAlignOffset, stream), + in(p.rows * p.cols + params.inAlignOffset, stream), + vec1(p.bcastAlongRows ? p.cols : p.rows, stream), + vec2(p.bcastAlongRows ? p.cols : p.rows, stream) + { + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + loop_on_state(state, [this]() { + if constexpr (OpT::useTwoVectors) { + raft::linalg::matrixVectorOp(out.data() + params.outAlignOffset, + in.data() + params.inAlignOffset, + vec1.data(), + vec2.data(), + params.cols, + params.rows, + params.rowMajor, + params.bcastAlongRows, + OpT{}, + stream); + } else { + raft::linalg::matrixVectorOp(out.data() + params.outAlignOffset, + in.data() + params.inAlignOffset, + vec1.data(), + params.cols, + params.rows, + params.rowMajor, + params.bcastAlongRows, + OpT{}, + stream); + } + }); + } + + private: + mat_vec_op_inputs params; + rmm::device_uvector out, in, vec1, vec2; +}; // struct MatVecOp + +template +std::vector> get_mv_inputs() +{ + std::vector> out; + + // Scalability benchmark with round dimensions + std::vector rows = {1000, 100000, 1000000}; + std::vector cols = {8, 64, 256, 1024}; + for (bool rowMajor : {true, false}) { + for (bool alongRows : {true, false}) { + for (IdxT rows_ : rows) { + for (IdxT cols_ : cols) { + out.push_back({rows_, cols_, 
rowMajor, alongRows, 0, 0}); + } + } + } + } + + // Odd dimensions, misalignment + std::vector> rowcols = { + {44739207, 7}, + {44739207, 15}, + {44739207, 16}, + {44739207, 17}, + {2611236, 256}, + {2611236, 257}, + {2611236, 263}, + }; + for (bool rowMajor : {true, false}) { + for (bool alongRows : {true, false}) { + for (auto rc : rowcols) { + for (IdxT inAlignOffset : {0, 1}) { + for (IdxT outAlignOffset : {0, 1}) { + out.push_back({std::get<0>(rc), + std::get<1>(rc), + rowMajor, + alongRows, + inAlignOffset, + outAlignOffset}); + } + } + } + } + } + return out; +} + +const std::vector> mv_input_i32 = get_mv_inputs(); +const std::vector> mv_input_i64 = get_mv_inputs(); + +template +struct Add1Vec { + static constexpr bool useTwoVectors = false; + HDI T operator()(T a, T b) const { return a + b; }; +}; +template +struct Add2Vec { + static constexpr bool useTwoVectors = true; + HDI T operator()(T a, T b, T c) const { return a + b + c; }; +}; + +RAFT_BENCH_REGISTER((mat_vec_op, float, int>), "", mv_input_i32); +RAFT_BENCH_REGISTER((mat_vec_op, double, int>), "", mv_input_i32); +RAFT_BENCH_REGISTER((mat_vec_op, float, int>), "", mv_input_i32); +RAFT_BENCH_REGISTER((mat_vec_op, double, int>), "", mv_input_i32); +RAFT_BENCH_REGISTER((mat_vec_op, float, int64_t>), "", mv_input_i64); +RAFT_BENCH_REGISTER((mat_vec_op, double, int64_t>), "", mv_input_i64); +RAFT_BENCH_REGISTER((mat_vec_op, float, int64_t>), "", mv_input_i64); +RAFT_BENCH_REGISTER((mat_vec_op, double, int64_t>), "", mv_input_i64); + +} // namespace raft::bench::linalg diff --git a/cpp/bench/prims/linalg/norm.cu b/cpp/bench/prims/linalg/norm.cu new file mode 100644 index 000000000..1db23e4ca --- /dev/null +++ b/cpp/bench/prims/linalg/norm.cu @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include + +namespace raft::bench::linalg { + +template +struct norm_input { + IdxT rows, cols; +}; + +template +inline auto operator<<(std::ostream& os, const norm_input& p) -> std::ostream& +{ + os << p.rows << "#" << p.cols; + return os; +} + +template +struct rowNorm : public fixture { + rowNorm(const norm_input& p) : params(p), in(p.rows * p.cols, stream), dots(p.rows, stream) + { + raft::random::RngState rng{1234}; + raft::random::uniform(handle, rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0); + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + loop_on_state(state, [this]() { + auto input_view = raft::make_device_matrix_view( + in.data(), params.rows, params.cols); + auto output_view = + raft::make_device_vector_view(dots.data(), params.rows); + raft::linalg::norm(handle, + input_view, + output_view, + raft::linalg::L2Norm, + raft::linalg::Apply::ALONG_ROWS, + raft::sqrt_op()); + }); + } + + private: + norm_input params; + rmm::device_uvector in, dots; +}; // struct rowNorm + +const std::vector> norm_inputs_i32 = + raft::util::itertools::product>({10, 100, 1000, 10000, 100000}, + {16, 32, 64, 128, 256, 512, 1024}); +const std::vector> norm_inputs_i64 = + raft::util::itertools::product>({10, 100, 1000, 10000, 100000}, + {16, 32, 64, 128, 256, 512, 1024}); + +RAFT_BENCH_REGISTER((rowNorm), "", norm_inputs_i32); +RAFT_BENCH_REGISTER((rowNorm), "", 
norm_inputs_i32); +RAFT_BENCH_REGISTER((rowNorm), "", norm_inputs_i64); +RAFT_BENCH_REGISTER((rowNorm), "", norm_inputs_i64); + +} // namespace raft::bench::linalg diff --git a/cpp/bench/prims/linalg/normalize.cu b/cpp/bench/prims/linalg/normalize.cu new file mode 100644 index 000000000..91319e774 --- /dev/null +++ b/cpp/bench/prims/linalg/normalize.cu @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +namespace raft::bench::linalg { + +template +struct normalize_input { + IdxT rows, cols; +}; + +template +inline auto operator<<(std::ostream& os, const normalize_input& p) -> std::ostream& +{ + os << p.rows << "#" << p.cols; + return os; +} + +template +struct rowNormalize : public fixture { + rowNormalize(const normalize_input& p) + : params(p), in(p.rows * p.cols, stream), out(p.rows * p.cols, stream) + { + raft::random::RngState rng{1234}; + raft::random::uniform(handle, rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0); + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + loop_on_state(state, [this]() { + auto input_view = raft::make_device_matrix_view( + in.data(), params.rows, params.cols); + auto output_view = raft::make_device_matrix_view( + out.data(), params.rows, params.cols); + 
raft::linalg::row_normalize(handle, input_view, output_view, raft::linalg::L2Norm); + }); + } + + private: + normalize_input params; + rmm::device_uvector in, out; +}; // struct rowNormalize + +const std::vector> normalize_inputs_i32 = + raft::util::itertools::product>( + {10, 100, 1000, 10000, 100000}, {8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}); +const std::vector> normalize_inputs_i64 = + raft::util::itertools::product>( + {10, 100, 1000, 10000, 100000}, {8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}); + +RAFT_BENCH_REGISTER((rowNormalize), "", normalize_inputs_i32); +RAFT_BENCH_REGISTER((rowNormalize), "", normalize_inputs_i32); +RAFT_BENCH_REGISTER((rowNormalize), "", normalize_inputs_i64); +RAFT_BENCH_REGISTER((rowNormalize), "", normalize_inputs_i64); + +} // namespace raft::bench::linalg diff --git a/cpp/bench/prims/linalg/reduce.cu b/cpp/bench/prims/linalg/reduce.cu new file mode 100644 index 000000000..cf41c5916 --- /dev/null +++ b/cpp/bench/prims/linalg/reduce.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include + +namespace raft::bench::linalg { + +struct input_size { + int rows, cols; + bool along_rows; +}; + +template +struct reduce : public fixture { + reduce(bool along_rows, const input_size& p) + : input_size(p), along_rows(along_rows), in(p.rows * p.cols, stream), out(p.rows, stream) + { + } + + void run_benchmark(::benchmark::State& state) override + { + loop_on_state(state, [this]() { + raft::linalg::reduce( + out.data(), in.data(), input_size.cols, input_size.rows, T(0.f), true, along_rows, stream); + }); + } + + private: + bool along_rows; + input_size input_size; + rmm::device_uvector in, out; +}; // struct reduce + +const std::vector kInputSizes{{8 * 1024, 1024}, + {1024, 8 * 1024}, + {8 * 1024, 8 * 1024}, + {32 * 1024, 1024}, + {1024, 32 * 1024}, + {32 * 1024, 32 * 1024}}; + +const std::vector kAlongRows{false, true}; + +RAFT_BENCH_REGISTER(reduce, "", kAlongRows, kInputSizes); +RAFT_BENCH_REGISTER(reduce, "", kAlongRows, kInputSizes); + +} // namespace raft::bench::linalg diff --git a/cpp/bench/prims/linalg/reduce_cols_by_key.cu b/cpp/bench/prims/linalg/reduce_cols_by_key.cu new file mode 100644 index 000000000..1b584e80c --- /dev/null +++ b/cpp/bench/prims/linalg/reduce_cols_by_key.cu @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include + +namespace raft::bench::linalg { + +template +struct rcbk_params { + IdxT rows, cols; + IdxT keys; +}; + +template +inline auto operator<<(std::ostream& os, const rcbk_params& p) -> std::ostream& +{ + os << p.rows << "#" << p.cols << "#" << p.keys; + return os; +} + +template +struct reduce_cols_by_key : public fixture { + reduce_cols_by_key(const rcbk_params& p) + : params(p), in(p.rows * p.cols, stream), out(p.rows * p.keys, stream), keys(p.cols, stream) + { + raft::random::RngState rng{42}; + raft::random::uniformInt(handle, rng, keys.data(), p.cols, (KeyT)0, (KeyT)p.keys); + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + loop_on_state(state, [this]() { + raft::linalg::reduce_cols_by_key( + in.data(), keys.data(), out.data(), params.rows, params.cols, params.keys, stream, false); + }); + } + + protected: + rcbk_params params; + rmm::device_uvector in, out; + rmm::device_uvector keys; +}; // struct reduce_cols_by_key + +const std::vector> rcbk_inputs_i32 = + raft::util::itertools::product>( + {1, 10, 100, 1000}, {1000, 10000, 100000}, {8, 32, 128, 512, 2048}); +const std::vector> rcbk_inputs_i64 = + raft::util::itertools::product>( + {1, 10, 100, 1000}, {1000, 10000, 100000}, {8, 32, 128, 512, 2048}); + +RAFT_BENCH_REGISTER((reduce_cols_by_key), "", rcbk_inputs_i32); +RAFT_BENCH_REGISTER((reduce_cols_by_key), "", rcbk_inputs_i32); +RAFT_BENCH_REGISTER((reduce_cols_by_key), "", rcbk_inputs_i64); +RAFT_BENCH_REGISTER((reduce_cols_by_key), "", rcbk_inputs_i64); + +} // namespace raft::bench::linalg diff --git a/cpp/bench/prims/linalg/reduce_rows_by_key.cu b/cpp/bench/prims/linalg/reduce_rows_by_key.cu new file mode 100644 index 000000000..b68cefc27 --- /dev/null +++ b/cpp/bench/prims/linalg/reduce_rows_by_key.cu @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include + +namespace raft::bench::linalg { + +struct rrbk_params { + int64_t rows, cols; + int64_t keys; +}; + +template +struct reduce_rows_by_key : public fixture { + reduce_rows_by_key(const rrbk_params& p) + : params(p), + in(p.rows * p.cols, stream), + out(p.keys * p.cols, stream), + keys(p.rows, stream), + workspace(p.rows, stream) + { + raft::random::RngState rng{42}; + raft::random::uniformInt(handle, rng, keys.data(), p.rows, (KeyT)0, (KeyT)p.keys); + } + + void run_benchmark(::benchmark::State& state) override + { + loop_on_state(state, [this]() { + raft::linalg::reduce_rows_by_key(in.data(), + params.cols, + keys.data(), + workspace.data(), + params.rows, + params.cols, + params.keys, + out.data(), + stream, + false); + }); + } + + protected: + rrbk_params params; + rmm::device_uvector in, out; + rmm::device_uvector keys; + rmm::device_uvector workspace; +}; // struct reduce_rows_by_key + +const std::vector kInputSizes{ + {10000, 128, 64}, + {100000, 128, 64}, + {1000000, 128, 64}, + {10000000, 128, 64}, + {10000, 128, 256}, + {100000, 128, 256}, + {1000000, 128, 256}, + {10000000, 128, 256}, + {10000, 128, 1024}, + {100000, 128, 1024}, + {1000000, 128, 1024}, + {10000000, 128, 1024}, + {10000, 128, 4096}, + {100000, 128, 4096}, + {1000000, 128, 4096}, + {10000000, 128, 4096}, +}; + +RAFT_BENCH_REGISTER((reduce_rows_by_key), "", kInputSizes); 
+RAFT_BENCH_REGISTER((reduce_rows_by_key), "", kInputSizes); + +} // namespace raft::bench::linalg diff --git a/cpp/bench/prims/main.cpp b/cpp/bench/prims/main.cpp new file mode 100644 index 000000000..40f539fac --- /dev/null +++ b/cpp/bench/prims/main.cpp @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include // NOLINT + +BENCHMARK_MAIN(); diff --git a/cpp/bench/prims/matrix/argmin.cu b/cpp/bench/prims/matrix/argmin.cu new file mode 100644 index 000000000..afee81aa0 --- /dev/null +++ b/cpp/bench/prims/matrix/argmin.cu @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include + +namespace raft::bench::matrix { + +template +struct ArgminParams { + IdxT rows, cols; +}; + +template +struct Argmin : public fixture { + Argmin(const ArgminParams& p) : params(p), matrix(this->handle), indices(this->handle) {} + + void allocate_data(const ::benchmark::State& state) override + { + matrix = raft::make_device_matrix(handle, params.rows, params.cols); + indices = raft::make_device_vector(handle, params.rows); + + raft::random::RngState rng{1234}; + raft::random::uniform( + handle, rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1)); + resource::sync_stream(handle, stream); + } + + void run_benchmark(::benchmark::State& state) override + { + loop_on_state(state, [this]() { + raft::matrix::argmin(handle, raft::make_const_mdspan(matrix.view()), indices.view()); + }); + } + + private: + ArgminParams params; + raft::device_matrix matrix; + raft::device_vector indices; +}; // struct Argmin + +const std::vector> argmin_inputs_i64 = + raft::util::itertools::product>({1000, 10000, 100000, 1000000, 10000000}, + {64, 128, 256, 512, 1024}); + +RAFT_BENCH_REGISTER((Argmin), "", argmin_inputs_i64); +RAFT_BENCH_REGISTER((Argmin), "", argmin_inputs_i64); + +} // namespace raft::bench::matrix diff --git a/cpp/bench/prims/matrix/gather.cu b/cpp/bench/prims/matrix/gather.cu new file mode 100644 index 000000000..00a145ffa --- /dev/null +++ b/cpp/bench/prims/matrix/gather.cu @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include + +namespace raft::bench::matrix { + +template +struct GatherParams { + IdxT rows, cols, map_length; +}; + +template +inline auto operator<<(std::ostream& os, const GatherParams& p) -> std::ostream& +{ + os << p.rows << "#" << p.cols << "#" << p.map_length; + return os; +} + +template +struct Gather : public fixture { + Gather(const GatherParams& p) + : params(p), matrix(this->handle), map(this->handle), out(this->handle), stencil(this->handle) + { + } + + void allocate_data(const ::benchmark::State& state) override + { + matrix = raft::make_device_matrix(handle, params.rows, params.cols); + map = raft::make_device_vector(handle, params.map_length); + out = raft::make_device_matrix(handle, params.map_length, params.cols); + stencil = raft::make_device_vector(handle, Conditional ? 
params.map_length : IdxT(0)); + + raft::random::RngState rng{1234}; + raft::random::uniform( + handle, rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1)); + raft::random::uniformInt( + handle, rng, map.data_handle(), params.map_length, (MapT)0, (MapT)params.rows); + if constexpr (Conditional) { + raft::random::uniform(handle, rng, stencil.data_handle(), params.map_length, T(-1), T(1)); + } + resource::sync_stream(handle, stream); + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + loop_on_state(state, [this]() { + auto matrix_const_view = raft::make_const_mdspan(matrix.view()); + auto map_const_view = raft::make_const_mdspan(map.view()); + if constexpr (Conditional) { + auto stencil_const_view = raft::make_const_mdspan(stencil.view()); + auto pred_op = raft::plug_const_op(T(0.0), raft::greater_op()); + raft::matrix::gather_if( + handle, matrix_const_view, out.view(), map_const_view, stencil_const_view, pred_op); + } else { + raft::matrix::gather(handle, matrix_const_view, map_const_view, out.view()); + } + }); + } + + private: + GatherParams params; + raft::device_matrix matrix, out; + raft::device_vector stencil; + raft::device_vector map; +}; // struct Gather + +template +using GatherIf = Gather; + +const std::vector> gather_inputs_i64 = + raft::util::itertools::product>( + {1000000}, {10, 20, 50, 100, 200, 500}, {1000, 10000, 100000, 1000000}); + +RAFT_BENCH_REGISTER((Gather), "", gather_inputs_i64); +RAFT_BENCH_REGISTER((Gather), "", gather_inputs_i64); +RAFT_BENCH_REGISTER((GatherIf), "", gather_inputs_i64); +RAFT_BENCH_REGISTER((GatherIf), "", gather_inputs_i64); +} // namespace raft::bench::matrix diff --git a/cpp/bench/prims/matrix/main.cpp b/cpp/bench/prims/matrix/main.cpp new file mode 100644 index 000000000..9cdb1c254 --- /dev/null +++ b/cpp/bench/prims/matrix/main.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2023, NVIDIA 
CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace raft::matrix { +void add_select_k_dataset_benchmarks(); +} + +int main(int argc, char** argv) +{ + // if we're passed a 'select_k_dataset' flag, add in extra benchmarks + for (int i = 1; i < argc; ++i) { + if (strcmp(argv[i], "--select_k_dataset") == 0) { + raft::matrix::add_select_k_dataset_benchmarks(); + + // pop off the cmdline argument from argc/argv + for (int j = i; j < argc - 1; ++j) + argv[j] = argv[j + 1]; + argc--; + break; + } + } + benchmark::Initialize(&argc, argv); + if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; + benchmark::RunSpecifiedBenchmarks(); +} diff --git a/cpp/bench/prims/matrix/select_k.cu b/cpp/bench/prims/matrix/select_k.cu new file mode 100644 index 000000000..d3994e59c --- /dev/null +++ b/cpp/bench/prims/matrix/select_k.cu @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace raft::matrix { +using namespace raft::bench; // NOLINT + +template +struct replace_with_mask { + KeyT replacement; + int64_t line_length; + int64_t spared_inputs; + constexpr auto inline operator()(int64_t offset, KeyT x, uint8_t mask) -> KeyT + { + auto i = offset % line_length; + // don't replace all the inputs, spare a few elements at the beginning of the input + return (mask && i >= spared_inputs) ? replacement : x; + } +}; + +template +struct selection : public fixture { + explicit selection(const select::params& p) + : fixture(p.use_memory_pool), + params_(p), + in_dists_(p.batch_size * p.len, stream), + in_ids_(p.batch_size * p.len, stream), + out_dists_(p.batch_size * p.k, stream), + out_ids_(p.batch_size * p.k, stream) + { + raft::sparse::iota_fill(in_ids_.data(), IdxT(p.batch_size), IdxT(p.len), stream); + raft::random::RngState state{42}; + + KeyT min_value = -1.0; + KeyT max_value = 1.0; + if (p.use_same_leading_bits) { + if constexpr (std::is_same_v) { + uint32_t min_bits = 0x3F800000; // 1.0 + uint32_t max_bits = 0x3F8000FF; // 1.00003 + memcpy(&min_value, &min_bits, sizeof(KeyT)); + memcpy(&max_value, &max_bits, sizeof(KeyT)); + } else if constexpr (std::is_same_v) { + uint64_t min_bits = 0x3FF0000000000000; // 1.0 + uint64_t max_bits = 0x3FF0000FFFFFFFFF; // 1.000015 + memcpy(&min_value, &min_bits, sizeof(KeyT)); + memcpy(&max_value, &max_bits, sizeof(KeyT)); + } + } + raft::random::uniform(handle, state, in_dists_.data(), in_dists_.size(), min_value, max_value); + if (p.frac_infinities > 0.0) { + rmm::device_uvector mask_buf(p.batch_size * p.len, stream); + auto mask = make_device_vector_view(mask_buf.data(), mask_buf.size()); + 
raft::random::bernoulli(handle, state, mask, p.frac_infinities); + KeyT bound = p.select_min ? raft::upper_bound() : raft::lower_bound(); + auto mask_in = + make_device_vector_view(mask_buf.data(), mask_buf.size()); + auto dists_in = make_device_vector_view(in_dists_.data(), in_dists_.size()); + auto dists_out = make_device_vector_view(in_dists_.data(), in_dists_.size()); + raft::linalg::map_offset(handle, + dists_out, + replace_with_mask{bound, int64_t(p.len), int64_t(p.k / 2)}, + dists_in, + mask_in); + } + } + + void run_benchmark(::benchmark::State& state) override // NOLINT + { + try { + std::ostringstream label_stream; + label_stream << params_.batch_size << "#" << params_.len << "#" << params_.k; + if (params_.use_same_leading_bits) { label_stream << "#same-leading-bits"; } + if (params_.frac_infinities > 0) { label_stream << "#infs-" << params_.frac_infinities; } + state.SetLabel(label_stream.str()); + common::nvtx::range case_scope("%s - %s", state.name().c_str(), label_stream.str().c_str()); + int iter = 0; + loop_on_state(state, [&iter, this]() { + common::nvtx::range lap_scope("lap-", iter++); + select::select_k_impl(handle, + Algo, + in_dists_.data(), + params_.use_index_input ? 
in_ids_.data() : NULL, + params_.batch_size, + params_.len, + params_.k, + out_dists_.data(), + out_ids_.data(), + params_.select_min); + }); + } catch (raft::exception& e) { + state.SkipWithError(e.what()); + } + } + + private: + const select::params params_; + rmm::device_uvector in_dists_, out_dists_; + rmm::device_uvector in_ids_, out_ids_; +}; + +const std::vector kInputs{ + {20000, 500, 1, true}, + {20000, 500, 2, true}, + {20000, 500, 4, true}, + {20000, 500, 8, true}, + {20000, 500, 16, true}, + {20000, 500, 32, true}, + {20000, 500, 64, true}, + {20000, 500, 128, true}, + {20000, 500, 256, true}, + + {1000, 10000, 1, true}, + {1000, 10000, 2, true}, + {1000, 10000, 4, true}, + {1000, 10000, 8, true}, + {1000, 10000, 16, true}, + {1000, 10000, 32, true}, + {1000, 10000, 64, true}, + {1000, 10000, 128, true}, + {1000, 10000, 256, true}, + + {100, 100000, 1, true}, + {100, 100000, 2, true}, + {100, 100000, 4, true}, + {100, 100000, 8, true}, + {100, 100000, 16, true}, + {100, 100000, 32, true}, + {100, 100000, 64, true}, + {100, 100000, 128, true}, + {100, 100000, 256, true}, + + {10, 1000000, 1, true}, + {10, 1000000, 2, true}, + {10, 1000000, 4, true}, + {10, 1000000, 8, true}, + {10, 1000000, 16, true}, + {10, 1000000, 32, true}, + {10, 1000000, 64, true}, + {10, 1000000, 128, true}, + {10, 1000000, 256, true}, + + {10, 1000000, 1, true, false, true}, + {10, 1000000, 2, true, false, true}, + {10, 1000000, 4, true, false, true}, + {10, 1000000, 8, true, false, true}, + {10, 1000000, 16, true, false, true}, + {10, 1000000, 32, true, false, true}, + {10, 1000000, 64, true, false, true}, + {10, 1000000, 128, true, false, true}, + {10, 1000000, 256, true, false, true}, + + {10, 1000000, 1, true, false, false, true, 0.1}, + {10, 1000000, 16, true, false, false, true, 0.1}, + {10, 1000000, 64, true, false, false, true, 0.1}, + {10, 1000000, 128, true, false, false, true, 0.1}, + {10, 1000000, 256, true, false, false, true, 0.1}, + + {10, 1000000, 1, true, false, 
false, true, 0.9}, + {10, 1000000, 16, true, false, false, true, 0.9}, + {10, 1000000, 64, true, false, false, true, 0.9}, + {10, 1000000, 128, true, false, false, true, 0.9}, + {10, 1000000, 256, true, false, false, true, 0.9}, + {1000, 10000, 1, true, false, false, true, 0.9}, + {1000, 10000, 16, true, false, false, true, 0.9}, + {1000, 10000, 64, true, false, false, true, 0.9}, + {1000, 10000, 128, true, false, false, true, 0.9}, + {1000, 10000, 256, true, false, false, true, 0.9}, + + {10, 1000000, 1, true, false, false, true, 1.0}, + {10, 1000000, 16, true, false, false, true, 1.0}, + {10, 1000000, 64, true, false, false, true, 1.0}, + {10, 1000000, 128, true, false, false, true, 1.0}, + {10, 1000000, 256, true, false, false, true, 1.0}, + {1000, 10000, 1, true, false, false, true, 1.0}, + {1000, 10000, 16, true, false, false, true, 1.0}, + {1000, 10000, 64, true, false, false, true, 1.0}, + {1000, 10000, 128, true, false, false, true, 1.0}, + {1000, 10000, 256, true, false, false, true, 1.0}, + {1000, 10000, 256, true, false, false, true, 0.999}, +}; + +#define SELECTION_REGISTER(KeyT, IdxT, A) \ + namespace BENCHMARK_PRIVATE_NAME(selection) { \ + using SelectK = selection; \ + RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #A, kInputs); \ + } + +SELECTION_REGISTER(float, uint32_t, kPublicApi); // NOLINT +SELECTION_REGISTER(float, uint32_t, kRadix8bits); // NOLINT +SELECTION_REGISTER(float, uint32_t, kRadix11bits); // NOLINT +SELECTION_REGISTER(float, uint32_t, kRadix11bitsExtraPass); // NOLINT +SELECTION_REGISTER(float, uint32_t, kWarpAuto); // NOLINT +SELECTION_REGISTER(float, uint32_t, kWarpImmediate); // NOLINT +SELECTION_REGISTER(float, uint32_t, kWarpFiltered); // NOLINT +SELECTION_REGISTER(float, uint32_t, kWarpDistributed); // NOLINT +SELECTION_REGISTER(float, uint32_t, kWarpDistributedShm); // NOLINT + +SELECTION_REGISTER(double, uint32_t, kRadix8bits); // NOLINT +SELECTION_REGISTER(double, uint32_t, kRadix11bits); // NOLINT 
+SELECTION_REGISTER(double, uint32_t, kRadix11bitsExtraPass); // NOLINT +SELECTION_REGISTER(double, uint32_t, kWarpAuto); // NOLINT + +SELECTION_REGISTER(double, int64_t, kRadix8bits); // NOLINT +SELECTION_REGISTER(double, int64_t, kRadix11bits); // NOLINT +SELECTION_REGISTER(double, int64_t, kRadix11bitsExtraPass); // NOLINT +SELECTION_REGISTER(double, int64_t, kWarpImmediate); // NOLINT +SELECTION_REGISTER(double, int64_t, kWarpFiltered); // NOLINT +SELECTION_REGISTER(double, int64_t, kWarpDistributed); // NOLINT +SELECTION_REGISTER(double, int64_t, kWarpDistributedShm); // NOLINT + +// For learning a heuristic of which selection algorithm to use, we +// have a couple of additional constraints when generating the dataset: +// 1. We want these benchmarks to be optionally enabled from the commandline - +// there are thousands of them, and the run-time is non-trivial. This should be opt-in only +// 2. We test out larger k values - that won't work for all algorithms. This requires filtering +// the input parameters per algorithm. 
+// This makes the code to generate this dataset different from the code above to +// register other benchmarks +#define SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, A, input) \ + { \ + using SelectK = selection; \ + std::stringstream name; \ + name << "SelectKDataset/" << #KeyT "/" #IdxT "/" #A << "/" << input.batch_size << "/" \ + << input.len << "/" << input.k << "/" << input.use_index_input << "/" \ + << input.use_memory_pool; \ + auto* b = ::benchmark::internal::RegisterBenchmarkInternal( \ + new raft::bench::internal::Fixture(name.str(), input)); \ + b->UseManualTime(); \ + b->Unit(benchmark::kMillisecond); \ + } + +const static size_t MAX_MEMORY = 16 * 1024 * 1024 * 1024ULL; + +// registers the input for all algorithms +#define SELECTION_REGISTER_INPUT(KeyT, IdxT, input) \ + { \ + size_t mem = input.batch_size * input.len * (sizeof(KeyT) + sizeof(IdxT)); \ + if (mem < MAX_MEMORY) { \ + SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, kRadix8bits, input) \ + SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, kRadix11bits, input) \ + SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, kRadix11bitsExtraPass, input) \ + if (input.k <= raft::matrix::detail::select::warpsort::kMaxCapacity) { \ + SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, kWarpImmediate, input) \ + SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, kWarpFiltered, input) \ + SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, kWarpDistributed, input) \ + SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, kWarpDistributedShm, input) \ + } \ + if (input.k <= raft::neighbors::detail::kFaissMaxK()) { \ + SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, kFaissBlockSelect, input) \ + } \ + } \ + } + +void add_select_k_dataset_benchmarks() +{ + // define a uniform grid + std::vector inputs; + + size_t grid_increment = 1; + std::vector k_vals; + for (size_t k = 0; k < 13; k += grid_increment) { + k_vals.push_back(1 << k); + } + // Add in values just past the limit for warp/faiss select + k_vals.push_back(257); + k_vals.push_back(2049); + + const static bool select_min = 
true; + const static bool use_ids = false; + + for (size_t row = 0; row < 13; row += grid_increment) { + for (size_t col = 10; col < 28; col += grid_increment) { + for (auto k : k_vals) { + inputs.push_back( + select::params{size_t(1 << row), size_t(1 << col), k, select_min, use_ids}); + } + } + } + + // also add in some random values + std::default_random_engine rng(42); + std::uniform_real_distribution<> row_dist(0, 13); + std::uniform_real_distribution<> col_dist(10, 28); + std::uniform_real_distribution<> k_dist(0, 13); + for (size_t i = 0; i < 1024; ++i) { + auto row = static_cast(pow(2, row_dist(rng))); + auto col = static_cast(pow(2, col_dist(rng))); + auto k = static_cast(pow(2, k_dist(rng))); + inputs.push_back(select::params{row, col, k, select_min, use_ids}); + } + + for (auto& input : inputs) { + SELECTION_REGISTER_INPUT(double, int64_t, input); + SELECTION_REGISTER_INPUT(double, uint32_t, input); + SELECTION_REGISTER_INPUT(float, int64_t, input); + SELECTION_REGISTER_INPUT(float, uint32_t, input); + } + + // also try again without a memory pool to see if there are significant differences + for (auto input : inputs) { + input.use_memory_pool = false; + SELECTION_REGISTER_INPUT(double, int64_t, input); + SELECTION_REGISTER_INPUT(double, uint32_t, input); + SELECTION_REGISTER_INPUT(float, int64_t, input); + SELECTION_REGISTER_INPUT(float, uint32_t, input); + } +} +} // namespace raft::matrix diff --git a/cpp/bench/prims/neighbors/cagra_bench.cuh b/cpp/bench/prims/neighbors/cagra_bench.cuh new file mode 100644 index 000000000..07e93a347 --- /dev/null +++ b/cpp/bench/prims/neighbors/cagra_bench.cuh @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace raft::bench::neighbors { + +struct params { + /** Size of the dataset. */ + size_t n_samples; + /** Number of dimensions in the dataset. */ + int n_dims; + /** The batch size -- number of KNN searches. */ + int n_queries; + /** Number of nearest neighbours to find for every probe. */ + int k; + /** kNN graph degree*/ + int degree; + int itopk_size; + int block_size; + int search_width; + int max_iterations; + /** Ratio of removed indices. */ + double removed_ratio; +}; + +template +struct CagraBench : public fixture { + explicit CagraBench(const params& ps) + : fixture(true), + params_(ps), + queries_(make_device_matrix(handle, ps.n_queries, ps.n_dims)), + dataset_(make_device_matrix(handle, ps.n_samples, ps.n_dims)), + knn_graph_(make_device_matrix(handle, ps.n_samples, ps.degree)), + removed_indices_bitset_(handle, ps.n_samples) + { + // Generate random dataset and queriees + raft::random::RngState state{42}; + constexpr T kRangeMax = std::is_integral_v ? std::numeric_limits::max() : T(1); + constexpr T kRangeMin = std::is_integral_v ? 
std::numeric_limits::min() : T(-1); + if constexpr (std::is_integral_v) { + raft::random::uniformInt( + handle, state, dataset_.data_handle(), dataset_.size(), kRangeMin, kRangeMax); + raft::random::uniformInt( + handle, state, queries_.data_handle(), queries_.size(), kRangeMin, kRangeMax); + } else { + raft::random::uniform( + handle, state, dataset_.data_handle(), dataset_.size(), kRangeMin, kRangeMax); + raft::random::uniform( + handle, state, queries_.data_handle(), queries_.size(), kRangeMin, kRangeMax); + } + + // Generate random knn graph + + raft::random::uniformInt( + handle, state, knn_graph_.data_handle(), knn_graph_.size(), 0, ps.n_samples - 1); + + auto metric = raft::distance::DistanceType::L2Expanded; + + auto removed_indices = + raft::make_device_vector(handle, ps.removed_ratio * ps.n_samples); + thrust::sequence( + resource::get_thrust_policy(handle), + thrust::device_pointer_cast(removed_indices.data_handle()), + thrust::device_pointer_cast(removed_indices.data_handle() + removed_indices.extent(0))); + removed_indices_bitset_.set(handle, removed_indices.view()); + index_.emplace(raft::neighbors::cagra::index( + handle, metric, make_const_mdspan(dataset_.view()), make_const_mdspan(knn_graph_.view()))); + } + + void run_benchmark(::benchmark::State& state) override + { + raft::neighbors::cagra::search_params search_params; + search_params.max_queries = 1024; + search_params.itopk_size = params_.itopk_size; + search_params.team_size = 0; + search_params.thread_block_size = params_.block_size; + search_params.search_width = params_.search_width; + + auto indices = make_device_matrix(handle, params_.n_queries, params_.k); + auto distances = make_device_matrix(handle, params_.n_queries, params_.k); + auto ind_v = make_device_matrix_view( + indices.data_handle(), params_.n_queries, params_.k); + auto dist_v = make_device_matrix_view( + distances.data_handle(), params_.n_queries, params_.k); + + auto queries_v = make_const_mdspan(queries_.view()); + if 
(params_.removed_ratio > 0) { + auto filter = raft::neighbors::filtering::bitset_filter(removed_indices_bitset_.view()); + loop_on_state(state, [&]() { + raft::neighbors::cagra::search_with_filtering( + this->handle, search_params, *this->index_, queries_v, ind_v, dist_v, filter); + }); + } else { + loop_on_state(state, [&]() { + raft::neighbors::cagra::search( + this->handle, search_params, *this->index_, queries_v, ind_v, dist_v); + }); + } + + double data_size = params_.n_samples * params_.n_dims * sizeof(T); + double graph_size = params_.n_samples * params_.degree * sizeof(IdxT); + + int iterations = params_.max_iterations; + if (iterations == 0) { + // see search_plan_impl::adjust_search_params() + double r = params_.itopk_size / static_cast(params_.search_width); + iterations = 1 + std::min(r * 1.1, r + 10); + } + state.counters["dataset (GiB)"] = data_size / (1 << 30); + state.counters["graph (GiB)"] = graph_size / (1 << 30); + state.counters["n_rows"] = params_.n_samples; + state.counters["n_cols"] = params_.n_dims; + state.counters["degree"] = params_.degree; + state.counters["n_queries"] = params_.n_queries; + state.counters["k"] = params_.k; + state.counters["itopk_size"] = params_.itopk_size; + state.counters["block_size"] = params_.block_size; + state.counters["search_width"] = params_.search_width; + state.counters["iterations"] = iterations; + state.counters["removed_ratio"] = params_.removed_ratio; + } + + private: + const params params_; + std::optional> index_; + raft::device_matrix queries_; + raft::device_matrix dataset_; + raft::device_matrix knn_graph_; + raft::core::bitset removed_indices_bitset_; +}; + +inline const std::vector generate_inputs() +{ + std::vector inputs = + raft::util::itertools::product({2000000ull}, // n_samples + {128, 256, 512, 1024}, // dataset dim + {1000}, // n_queries + {32}, // k + {64}, // knn graph degree + {64}, // itopk_size + {0}, // block_size + {1}, // search_width + {0}, // max_iterations + {0.0} // 
removed_ratio + ); + auto inputs2 = raft::util::itertools::product({2000000ull, 10000000ull}, // n_samples + {128}, // dataset dim + {1000}, // n_queries + {32}, // k + {64}, // knn graph degree + {64}, // itopk_size + {64, 128, 256, 512, 1024}, // block_size + {1}, // search_width + {0}, // max_iterations + {0.0} // removed_ratio + ); + inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); + + inputs2 = raft::util::itertools::product( + {2000000ull, 10000000ull}, // n_samples + {128}, // dataset dim + {1, 10, 10000}, // n_queries + {255}, // k + {64}, // knn graph degree + {300}, // itopk_size + {256}, // block_size + {2}, // search_width + {0}, // max_iterations + {0.0, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64} // removed_ratio + ); + inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); + return inputs; +} + +const std::vector kCagraInputs = generate_inputs(); + +#define CAGRA_REGISTER(ValT, IdxT, inputs) \ + namespace BENCHMARK_PRIVATE_NAME(knn) { \ + using AnnCagra = CagraBench; \ + RAFT_BENCH_REGISTER(AnnCagra, #ValT "/" #IdxT, inputs); \ + } + +} // namespace raft::bench::neighbors diff --git a/cpp/bench/prims/neighbors/knn.cuh b/cpp/bench/prims/neighbors/knn.cuh new file mode 100644 index 000000000..55865b577 --- /dev/null +++ b/cpp/bench/prims/neighbors/knn.cuh @@ -0,0 +1,509 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +#include + +namespace raft::bench::spatial { + +struct params { + /** Size of the dataset. */ + size_t n_samples; + /** Number of dimensions in the dataset. */ + size_t n_dims; + /** The batch size -- number of KNN searches. */ + size_t n_queries; + /** Number of nearest neighbours to find for every probe. */ + size_t k; + /** Ratio of removed indices. */ + double removed_ratio; +}; + +inline auto operator<<(std::ostream& os, const params& p) -> std::ostream& +{ + os << p.n_samples << "#" << p.n_dims << "#" << p.n_queries << "#" << p.k << "#" + << p.removed_ratio; + return os; +} + +enum class TransferStrategy { NO_COPY, COPY_PLAIN, COPY_PINNED, MAP_PINNED, MANAGED }; // NOLINT +enum class Scope { BUILD, SEARCH, BUILD_SEARCH }; // NOLINT + +inline auto operator<<(std::ostream& os, const TransferStrategy& ts) -> std::ostream& +{ + switch (ts) { + case TransferStrategy::NO_COPY: os << "NO_COPY"; break; + case TransferStrategy::COPY_PLAIN: os << "COPY_PLAIN"; break; + case TransferStrategy::COPY_PINNED: os << "COPY_PINNED"; break; + case TransferStrategy::MAP_PINNED: os << "MAP_PINNED"; break; + case TransferStrategy::MANAGED: os << "MANAGED"; break; + default: os << "UNKNOWN"; + } + return os; +} + +inline auto operator<<(std::ostream& os, const Scope& s) -> std::ostream& +{ + switch (s) { + case Scope::BUILD: os << "BUILD"; break; + case Scope::SEARCH: os << "SEARCH"; break; + case Scope::BUILD_SEARCH: os << "BUILD_SEARCH"; break; + default: os << "UNKNOWN"; + } + return os; +} + +struct device_resource { + public: + explicit device_resource(bool managed) : managed_(managed) + { + if (managed_) { + res_ = new rmm::mr::managed_memory_resource(); + } else { + res_ = rmm::mr::get_current_device_resource(); + } + } + + ~device_resource() + { + if (managed_) { delete res_; } + } + + [[nodiscard]] 
auto get() const -> rmm::mr::device_memory_resource* { return res_; } + + private: + const bool managed_; + rmm::mr::device_memory_resource* res_; +}; + +template +struct host_uvector { + host_uvector(size_t n, bool pinned) : n_(n) + { + if (pinned) { + res_ = new rmm::mr::pinned_memory_resource(); + } else { + res_ = new rmm::mr::new_delete_resource(); + } + arr_ = static_cast(res_->allocate(n_ * sizeof(T))); + } + + ~host_uvector() noexcept + { + res_->deallocate(arr_, n_ * sizeof(T)); + delete res_; + } + + auto data() -> T* { return arr_; } + [[nodiscard]] auto size() const -> size_t { return n_; } + + private: + rmm::mr::host_memory_resource* res_; + size_t n_; + T* arr_; +}; + +template +struct ivf_flat_knn { + using dist_t = float; + + std::optional> index; + raft::neighbors::ivf_flat::index_params index_params; + raft::neighbors::ivf_flat::search_params search_params; + params ps; + + ivf_flat_knn(const raft::device_resources& handle, const params& ps, const ValT* data) : ps(ps) + { + index_params.n_lists = 4096; + index_params.metric = raft::distance::DistanceType::L2Expanded; + index.emplace(raft::neighbors::ivf_flat::build( + handle, index_params, data, IdxT(ps.n_samples), uint32_t(ps.n_dims))); + } + + void search(const raft::device_resources& handle, + const ValT* search_items, + dist_t* out_dists, + IdxT* out_idxs) + { + search_params.n_probes = 20; + raft::neighbors::ivf_flat::search( + handle, search_params, *index, search_items, ps.n_queries, ps.k, out_idxs, out_dists); + } +}; + +template +struct ivf_pq_knn { + using dist_t = float; + + std::optional> index; + raft::neighbors::ivf_pq::index_params index_params; + raft::neighbors::ivf_pq::search_params search_params; + params ps; + + ivf_pq_knn(const raft::device_resources& handle, const params& ps, const ValT* data) : ps(ps) + { + index_params.n_lists = 4096; + index_params.metric = raft::distance::DistanceType::L2Expanded; + auto data_view = raft::make_device_matrix_view(data, ps.n_samples, 
ps.n_dims); + index.emplace(raft::neighbors::ivf_pq::build(handle, index_params, data_view)); + } + + void search(const raft::device_resources& handle, + const ValT* search_items, + dist_t* out_dists, + IdxT* out_idxs) + { + search_params.n_probes = 20; + auto queries_view = + raft::make_device_matrix_view(search_items, ps.n_queries, ps.n_dims); + auto idxs_view = raft::make_device_matrix_view(out_idxs, ps.n_queries, ps.k); + auto dists_view = + raft::make_device_matrix_view(out_dists, ps.n_queries, ps.k); + raft::neighbors::ivf_pq::search( + handle, search_params, *index, queries_view, idxs_view, dists_view); + } +}; + +template +struct brute_force_knn { + using dist_t = ValT; + + ValT* index; + params ps; + + brute_force_knn(const raft::device_resources& handle, const params& ps, const ValT* data) + : index(const_cast(data)), ps(ps) + { + } + + void search(const raft::device_resources& handle, + const ValT* search_items, + dist_t* out_dists, + IdxT* out_idxs) + { + std::vector input{index}; + std::vector sizes{ps.n_samples}; + raft::spatial::knn::brute_force_knn(handle, + input, + sizes, + ps.n_dims, + const_cast(search_items), + ps.n_queries, + out_idxs, + out_dists, + ps.k); + } +}; + +template +struct ivf_flat_filter_knn { + using dist_t = float; + + std::optional> index; + raft::neighbors::ivf_flat::index_params index_params; + raft::neighbors::ivf_flat::search_params search_params; + raft::core::bitset removed_indices_bitset_; + params ps; + + ivf_flat_filter_knn(const raft::device_resources& handle, const params& ps, const ValT* data) + : ps(ps), removed_indices_bitset_(handle, ps.n_samples) + { + index_params.n_lists = 4096; + index_params.metric = raft::distance::DistanceType::L2Expanded; + index.emplace(raft::neighbors::ivf_flat::build( + handle, index_params, data, IdxT(ps.n_samples), uint32_t(ps.n_dims))); + auto removed_indices = + raft::make_device_vector(handle, ps.removed_ratio * ps.n_samples); + thrust::sequence( + 
resource::get_thrust_policy(handle), + thrust::device_pointer_cast(removed_indices.data_handle()), + thrust::device_pointer_cast(removed_indices.data_handle() + removed_indices.extent(0))); + removed_indices_bitset_.set(handle, removed_indices.view()); + } + + void search(const raft::device_resources& handle, + const ValT* search_items, + dist_t* out_dists, + IdxT* out_idxs) + { + search_params.n_probes = 20; + auto queries_view = + raft::make_device_matrix_view(search_items, ps.n_queries, ps.n_dims); + auto neighbors_view = raft::make_device_matrix_view(out_idxs, ps.n_queries, ps.k); + auto distance_view = raft::make_device_matrix_view(out_dists, ps.n_queries, ps.k); + auto filter = raft::neighbors::filtering::bitset_filter(removed_indices_bitset_.view()); + + if (ps.removed_ratio > 0) { + raft::neighbors::ivf_flat::search_with_filtering( + handle, search_params, *index, queries_view, neighbors_view, distance_view, filter); + } else { + raft::neighbors::ivf_flat::search( + handle, search_params, *index, queries_view, neighbors_view, distance_view); + } + } +}; + +template +struct ivf_pq_filter_knn { + using dist_t = float; + + std::optional> index; + raft::neighbors::ivf_pq::index_params index_params; + raft::neighbors::ivf_pq::search_params search_params; + raft::core::bitset removed_indices_bitset_; + params ps; + + ivf_pq_filter_knn(const raft::device_resources& handle, const params& ps, const ValT* data) + : ps(ps), removed_indices_bitset_(handle, ps.n_samples) + { + index_params.n_lists = 4096; + index_params.metric = raft::distance::DistanceType::L2Expanded; + auto data_view = raft::make_device_matrix_view(data, ps.n_samples, ps.n_dims); + index.emplace(raft::neighbors::ivf_pq::build(handle, index_params, data_view)); + auto removed_indices = + raft::make_device_vector(handle, ps.removed_ratio * ps.n_samples); + thrust::sequence( + resource::get_thrust_policy(handle), + thrust::device_pointer_cast(removed_indices.data_handle()), + 
thrust::device_pointer_cast(removed_indices.data_handle() + removed_indices.extent(0))); + removed_indices_bitset_.set(handle, removed_indices.view()); + } + + void search(const raft::device_resources& handle, + const ValT* search_items, + dist_t* out_dists, + IdxT* out_idxs) + { + search_params.n_probes = 20; + auto queries_view = + raft::make_device_matrix_view(search_items, ps.n_queries, ps.n_dims); + auto neighbors_view = + raft::make_device_matrix_view(out_idxs, ps.n_queries, ps.k); + auto distance_view = + raft::make_device_matrix_view(out_dists, ps.n_queries, ps.k); + auto filter = raft::neighbors::filtering::bitset_filter(removed_indices_bitset_.view()); + + if (ps.removed_ratio > 0) { + raft::neighbors::ivf_pq::search_with_filtering( + handle, search_params, *index, queries_view, neighbors_view, distance_view, filter); + } else { + raft::neighbors::ivf_pq::search( + handle, search_params, *index, queries_view, neighbors_view, distance_view); + } + } +}; + +template +struct knn : public fixture { + explicit knn(const params& p, const TransferStrategy& strategy, const Scope& scope) + : fixture(true), + params_(p), + strategy_(strategy), + scope_(scope), + dev_mem_res_(strategy == TransferStrategy::MANAGED), + data_host_(0), + search_items_(p.n_queries * p.n_dims, stream), + out_dists_(p.n_queries * p.k, stream), + out_idxs_(p.n_queries * p.k, stream) + { + raft::random::RngState state{42}; + gen_data(state, search_items_, search_items_.size(), stream); + try { + size_t total_size = p.n_samples * p.n_dims; + data_host_.resize(total_size); + constexpr size_t kGenMinibatchSize = 1024 * 1024 * 1024; + rmm::device_uvector d(std::min(kGenMinibatchSize, total_size), stream); + for (size_t offset = 0; offset < total_size; offset += kGenMinibatchSize) { + size_t actual_size = std::min(total_size - offset, kGenMinibatchSize); + gen_data(state, d, actual_size, stream); + copy(data_host_.data() + offset, d.data(), actual_size, stream); + } + } catch (std::bad_alloc& e) 
{ + data_does_not_fit_ = true; + } + } + + template + void gen_data(raft::random::RngState& state, // NOLINT + rmm::device_uvector& vec, + size_t n, + rmm::cuda_stream_view stream) + { + constexpr T kRangeMax = std::is_integral_v ? std::numeric_limits::max() : T(1); + constexpr T kRangeMin = std::is_integral_v ? std::numeric_limits::min() : T(-1); + if constexpr (std::is_integral_v) { + raft::random::uniformInt(handle, state, vec.data(), n, kRangeMin, kRangeMax); + } else { + raft::random::uniform(handle, state, vec.data(), n, kRangeMin, kRangeMax); + } + } + + void run_benchmark(::benchmark::State& state) override + { + if (data_does_not_fit_) { + state.SkipWithError("The data size is too big to fit into the host memory."); + } + if (scope_ == Scope::SEARCH && strategy_ != TransferStrategy::NO_COPY) { + state.SkipWithError( + "When benchmarking without index building (Scope::SEARCH), the data must be already on the " + "device (TransferStrategy::NO_COPY)"); + } + + try { + std::ostringstream label_stream; + label_stream << params_ << "#" << strategy_ << "#" << scope_; + state.SetLabel(label_stream.str()); + raft::device_resources handle(stream); + std::optional index; + + if (scope_ == Scope::SEARCH) { // also implies TransferStrategy::NO_COPY + rmm::device_uvector data(data_host_.size(), stream); + copy(data.data(), data_host_.data(), data_host_.size(), stream); + index.emplace(handle, params_, data.data()); + stream.synchronize(); + } + + // benchmark loop + for (auto _ : state) { + // managed or plain device memory initialized anew every time + rmm::device_uvector data(data_host_.size(), stream, dev_mem_res_.get()); + ValT* data_ptr = data.data(); + size_t allocation_size = data_host_.size() * sizeof(ValT); + + // Non-benchmarked part: using different methods to copy the data if necessary + switch (strategy_) { + case TransferStrategy::NO_COPY: // copy data to GPU before starting the timer. 
+ copy(data_ptr, data_host_.data(), data_host_.size(), stream); + break; + case TransferStrategy::COPY_PINNED: + RAFT_CUDA_TRY( + cudaHostRegister(data_host_.data(), allocation_size, cudaHostRegisterDefault)); + break; + case TransferStrategy::MAP_PINNED: + RAFT_CUDA_TRY( + cudaHostRegister(data_host_.data(), allocation_size, cudaHostRegisterMapped)); + RAFT_CUDA_TRY(cudaHostGetDevicePointer(&data_ptr, data_host_.data(), 0)); + break; + case TransferStrategy::MANAGED: // sic! using std::memcpy rather than cuda copy + RAFT_CUDA_TRY(cudaMemAdvise(data_ptr, + allocation_size, + cudaMemAdviseSetPreferredLocation, + resource::get_device_id(handle))); + RAFT_CUDA_TRY(cudaMemAdvise(data_ptr, + allocation_size, + cudaMemAdviseSetAccessedBy, + resource::get_device_id(handle))); + RAFT_CUDA_TRY(cudaMemAdvise(data_ptr, + allocation_size, + cudaMemAdviseSetReadMostly, + resource::get_device_id(handle))); + std::memcpy(data_ptr, data_host_.data(), allocation_size); + break; + default: break; + } + + flush_L2_cache(); + { + // Timer synchronizes the stream, so all prior gpu work should be done before it sets off. 
+ cuda_event_timer timer(state, stream); + switch (strategy_) { + case TransferStrategy::COPY_PLAIN: + case TransferStrategy::COPY_PINNED: + copy(data_ptr, data_host_.data(), data_host_.size(), stream); + default: break; + } + + if (scope_ != Scope::SEARCH) { index.emplace(handle, params_, data_ptr); } + if (scope_ != Scope::BUILD) { + index->search(handle, search_items_.data(), out_dists_.data(), out_idxs_.data()); + } + } + + if (scope_ != Scope::SEARCH) { index.reset(); } + + switch (strategy_) { + case TransferStrategy::COPY_PINNED: + case TransferStrategy::MAP_PINNED: + RAFT_CUDA_TRY(cudaHostUnregister(data_host_.data())); + break; + default: break; + } + } + } catch (raft::exception& e) { + state.SkipWithError(e.what()); + } catch (std::bad_alloc& e) { + state.SkipWithError(e.what()); + } + } + + private: + const params params_; + const TransferStrategy strategy_; + const Scope scope_; + device_resource dev_mem_res_; + bool data_does_not_fit_ = false; + + std::vector data_host_; + rmm::device_uvector search_items_; + rmm::device_uvector out_dists_; + rmm::device_uvector out_idxs_; +}; + +inline const std::vector kInputs{ + {2000000, 128, 1000, 32, 0}, {10000000, 128, 1000, 32, 0}, {10000, 8192, 1000, 32, 0}}; + +const std::vector kInputsFilter = + raft::util::itertools::product({size_t(10000000)}, // n_samples + {size_t(128)}, // n_dim + {size_t(1000)}, // n_queries + {size_t(255)}, // k + {0.0, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64} // removed_ratio + ); +inline const std::vector kAllStrategies{ + TransferStrategy::NO_COPY, TransferStrategy::MAP_PINNED, TransferStrategy::MANAGED}; +inline const std::vector kNoCopyOnly{TransferStrategy::NO_COPY}; + +inline const std::vector kScopeFull{Scope::BUILD_SEARCH}; +inline const std::vector kAllScopes{Scope::BUILD_SEARCH, Scope::SEARCH, Scope::BUILD}; + +#define KNN_REGISTER(ValT, IdxT, ImplT, inputs, strats, scope) \ + namespace BENCHMARK_PRIVATE_NAME(knn) { \ + using KNN = knn>; \ + RAFT_BENCH_REGISTER(KNN, #ValT "/" 
#IdxT "/" #ImplT, inputs, strats, scope); \ + } + +} // namespace raft::bench::spatial diff --git a/cpp/bench/prims/neighbors/knn/brute_force_float_int64_t.cu b/cpp/bench/prims/neighbors/knn/brute_force_float_int64_t.cu new file mode 100644 index 000000000..7df059967 --- /dev/null +++ b/cpp/bench/prims/neighbors/knn/brute_force_float_int64_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(float, int64_t, brute_force_knn, kInputs, kAllStrategies, kScopeFull); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/prims/neighbors/knn/brute_force_float_uint32_t.cu b/cpp/bench/prims/neighbors/knn/brute_force_float_uint32_t.cu new file mode 100644 index 000000000..9704d39e7 --- /dev/null +++ b/cpp/bench/prims/neighbors/knn/brute_force_float_uint32_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(float, uint32_t, brute_force_knn, kInputs, kAllStrategies, kScopeFull); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/prims/neighbors/knn/cagra_float_uint32_t.cu b/cpp/bench/prims/neighbors/knn/cagra_float_uint32_t.cu new file mode 100644 index 000000000..5d762f6e8 --- /dev/null +++ b/cpp/bench/prims/neighbors/knn/cagra_float_uint32_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../cagra_bench.cuh" + +namespace raft::bench::neighbors { + +CAGRA_REGISTER(float, uint32_t, kCagraInputs); + +} // namespace raft::bench::neighbors diff --git a/cpp/bench/prims/neighbors/knn/ivf_flat_filter_float_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_flat_filter_float_int64_t.cu new file mode 100644 index 000000000..bf5118cea --- /dev/null +++ b/cpp/bench/prims/neighbors/knn/ivf_flat_filter_float_int64_t.cu @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#undef RAFT_EXPLICIT_INSTANTIATE_ONLY // Enable instantiation of search with filter +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(float, int64_t, ivf_flat_filter_knn, kInputsFilter, kNoCopyOnly, kScopeFull); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu new file mode 100644 index 000000000..fbbb4f9ac --- /dev/null +++ b/cpp/bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(float, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu new file mode 100644 index 000000000..7067dbe1b --- /dev/null +++ b/cpp/bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(int8_t, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu new file mode 100644 index 000000000..91fada3c2 --- /dev/null +++ b/cpp/bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(uint8_t, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu new file mode 100644 index 000000000..9534515cb --- /dev/null +++ b/cpp/bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#undef RAFT_EXPLICIT_INSTANTIATE_ONLY // Enable instantiation of search with filter +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(float, int64_t, ivf_pq_filter_knn, kInputsFilter, kNoCopyOnly, kScopeFull); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu new file mode 100644 index 000000000..83c4973c3 --- /dev/null +++ b/cpp/bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(float, int64_t, ivf_pq_knn, kInputs, kNoCopyOnly, kAllScopes); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu new file mode 100644 index 000000000..4ea281b11 --- /dev/null +++ b/cpp/bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(int8_t, int64_t, ivf_pq_knn, kInputs, kNoCopyOnly, kAllScopes); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu new file mode 100644 index 000000000..3313a49ba --- /dev/null +++ b/cpp/bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(uint8_t, int64_t, ivf_pq_knn, kInputs, kNoCopyOnly, kAllScopes); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/prims/neighbors/refine.cuh b/cpp/bench/prims/neighbors/refine.cuh new file mode 100644 index 000000000..121917f34 --- /dev/null +++ b/cpp/bench/prims/neighbors/refine.cuh @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +using namespace raft::neighbors; + +namespace raft::bench::neighbors { + +template +inline auto operator<<(std::ostream& os, const RefineInputs& p) -> std::ostream& +{ + os << p.n_rows << "#" << p.dim << "#" << p.n_queries << "#" << p.k0 << "#" << p.k << "#" + << (p.host_data ? 
"host" : "device"); + return os; +} + +template +class RefineAnn : public fixture { + public: + RefineAnn(RefineInputs p) : data(handle_, p) {} + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << data.p; + state.SetLabel(label_stream.str()); + + auto old_mr = rmm::mr::get_current_device_resource(); + rmm::mr::pool_memory_resource pool_mr(old_mr); + rmm::mr::set_current_device_resource(&pool_mr); + + if (data.p.host_data) { + loop_on_state(state, [this]() { + raft::neighbors::refine(handle_, + data.dataset_host.view(), + data.queries_host.view(), + data.candidates_host.view(), + data.refined_indices_host.view(), + data.refined_distances_host.view(), + data.p.metric); + }); + } else { + loop_on_state(state, [&]() { + raft::neighbors::refine(handle_, + data.dataset.view(), + data.queries.view(), + data.candidates.view(), + data.refined_indices.view(), + data.refined_distances.view(), + data.p.metric); + }); + } + rmm::mr::set_current_device_resource(old_mr); + } + + private: + raft::device_resources handle_; + RefineHelper data; +}; + +template +std::vector> getInputs() +{ + std::vector> out; + raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded; + for (bool host_data : {true, false}) { + for (T n_queries : {1000, 10000}) { + for (T dim : {128, 512}) { + out.push_back(RefineInputs{n_queries, 2000000, dim, 32, 128, metric, host_data}); + out.push_back(RefineInputs{n_queries, 2000000, dim, 10, 40, metric, host_data}); + } + } + } + return out; +} + +} // namespace raft::bench::neighbors diff --git a/cpp/bench/prims/neighbors/refine_float_int64_t.cu b/cpp/bench/prims/neighbors/refine_float_int64_t.cu new file mode 100644 index 000000000..bbedc1ae6 --- /dev/null +++ b/cpp/bench/prims/neighbors/refine_float_int64_t.cu @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "refine.cuh" +#include + +using namespace raft::neighbors; + +namespace raft::bench::neighbors { +using refine_float_int64 = RefineAnn; +RAFT_BENCH_REGISTER(refine_float_int64, "", getInputs()); +} // namespace raft::bench::neighbors diff --git a/cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu b/cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu new file mode 100644 index 000000000..4952361f0 --- /dev/null +++ b/cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "refine.cuh" +#include + +using namespace raft::neighbors; + +namespace raft::bench::neighbors { +using refine_uint8_int64 = RefineAnn; +RAFT_BENCH_REGISTER(refine_uint8_int64, "", getInputs()); +} // namespace raft::bench::neighbors diff --git a/cpp/bench/prims/random/make_blobs.cu b/cpp/bench/prims/random/make_blobs.cu new file mode 100644 index 000000000..f43d914cf --- /dev/null +++ b/cpp/bench/prims/random/make_blobs.cu @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +namespace raft::bench::random { +struct make_blobs_inputs { + int rows, cols, clusters; + bool row_major; +}; // struct make_blobs_inputs + +inline auto operator<<(std::ostream& os, const make_blobs_inputs& p) -> std::ostream& +{ + os << p.rows << "#" << p.cols << "#" << p.clusters << "#" << p.row_major; + return os; +} + +template +struct make_blobs : public fixture { + make_blobs(const make_blobs_inputs& p) + : params(p), data(p.rows * p.cols, stream), labels(p.rows, stream) + { + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + loop_on_state(state, [this]() { + raft::random::make_blobs(data.data(), + labels.data(), + params.rows, + params.cols, + params.clusters, + this->stream, + params.row_major); + }); + } + + private: + make_blobs_inputs params; + rmm::device_uvector data; + rmm::device_uvector labels; +}; // struct MakeBlobs + +static std::vector get_make_blobs_input_vecs() +{ + std::vector out; + make_blobs_inputs p; + for (auto rows : std::vector{100000, 1000000}) { + for (auto cols : std::vector{10, 100}) { + for (auto clusters : std::vector{2, 10, 100}) { + p.rows = rows; + p.cols = cols; + p.clusters = clusters; + p.row_major = true; + out.push_back(p); + p.row_major = false; + out.push_back(p); + } + } + } + return out; +} + +RAFT_BENCH_REGISTER(make_blobs, "", get_make_blobs_input_vecs()); +RAFT_BENCH_REGISTER(make_blobs, "", get_make_blobs_input_vecs()); + +} // namespace raft::bench::random diff --git a/cpp/bench/prims/random/permute.cu b/cpp/bench/prims/random/permute.cu new file mode 100644 index 000000000..829cf4272 --- /dev/null +++ b/cpp/bench/prims/random/permute.cu @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +namespace raft::bench::random { + +struct permute_inputs { + int rows, cols; + bool needPerms, needShuffle, rowMajor; +}; // struct permute_inputs + +template +struct permute : public fixture { + permute(const permute_inputs& p) + : params(p), + perms(p.needPerms ? p.rows : 0, stream), + out(p.rows * p.cols, stream), + in(p.rows * p.cols, stream) + { + raft::random::RngState r(123456ULL); + uniform(handle, r, in.data(), p.rows, T(-1.0), T(1.0)); + } + + void run_benchmark(::benchmark::State& state) override + { + raft::random::RngState r(123456ULL); + loop_on_state(state, [this, &r]() { + raft::random::permute( + perms.data(), out.data(), in.data(), params.cols, params.rows, params.rowMajor, stream); + }); + } + + private: + raft::device_resources handle; + permute_inputs params; + rmm::device_uvector out, in; + rmm::device_uvector perms; +}; // struct permute + +const std::vector permute_input_vecs = { + {32 * 1024, 128, true, true, true}, + {1024 * 1024, 128, true, true, true}, + {32 * 1024, 128 + 2, true, true, true}, + {1024 * 1024, 128 + 2, true, true, true}, + {32 * 1024, 128 + 1, true, true, true}, + {1024 * 1024, 128 + 1, true, true, true}, + + {32 * 1024, 128, true, true, false}, + {1024 * 1024, 128, true, true, false}, + {32 * 1024, 128 + 2, true, true, false}, + {1024 * 1024, 128 + 2, true, true, false}, + {32 * 1024, 128 + 1, true, true, false}, + {1024 * 1024, 128 + 1, true, true, false}, + +}; + +RAFT_BENCH_REGISTER(permute, "", permute_input_vecs); 
+RAFT_BENCH_REGISTER(permute, "", permute_input_vecs); + +} // namespace raft::bench::random diff --git a/cpp/bench/prims/random/rng.cu b/cpp/bench/prims/random/rng.cu new file mode 100644 index 000000000..d15c9441d --- /dev/null +++ b/cpp/bench/prims/random/rng.cu @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include + +namespace raft::bench::random { + +enum RandomType { + RNG_Normal, + RNG_LogNormal, + RNG_Uniform, + RNG_Gumbel, + RNG_Logistic, + RNG_Exp, + RNG_Rayleigh, + RNG_Laplace, + RNG_Fill +}; // enum RandomType + +template +struct rng_inputs { + int len; + RandomType type; + raft::random::GeneratorType gtype; + T start, end; +}; // struct rng_inputs + +template +struct rng : public fixture { + rng(const rng_inputs& p) : params(p), ptr(p.len, stream) {} + + void run_benchmark(::benchmark::State& state) override + { + raft::random::RngState r(123456ULL, params.gtype); + loop_on_state(state, [this, &r]() { + switch (params.type) { + case RNG_Normal: normal(handle, r, ptr.data(), params.len, params.start, params.end); break; + case RNG_LogNormal: + lognormal(handle, r, ptr.data(), params.len, params.start, params.end); + break; + case RNG_Uniform: + uniform(handle, r, ptr.data(), params.len, params.start, params.end); + break; + case RNG_Gumbel: gumbel(handle, r, ptr.data(), params.len, params.start, params.end); break; + case 
RNG_Logistic: + logistic(handle, r, ptr.data(), params.len, params.start, params.end); + break; + case RNG_Exp: exponential(handle, r, ptr.data(), params.len, params.start); break; + case RNG_Rayleigh: rayleigh(handle, r, ptr.data(), params.len, params.start); break; + case RNG_Laplace: + laplace(handle, r, ptr.data(), params.len, params.start, params.end); + break; + case RNG_Fill: fill(handle, r, ptr.data(), params.len, params.start); break; + }; + }); + } + + private: + rng_inputs params; + rmm::device_uvector ptr; +}; // struct RngBench + +template +static std::vector> get_rng_input_vecs() +{ + using namespace raft::random; + return { + {1024 * 1024, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)}, + {32 * 1024 * 1024, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)}, + {1024 * 1024 * 1024, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)}, + {1024 * 1024 + 2, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)}, + {32 * 1024 * 1024 + 2, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)}, + {1024 * 1024 * 1024 + 2, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)}, + {1024 * 1024 + 1, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)}, + {32 * 1024 * 1024 + 1, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)}, + {1024 * 1024 * 1024 + 1, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)}, + + {1024 * 1024, RNG_Uniform, GenPC, T(-1.0), T(1.0)}, + {32 * 1024 * 1024, RNG_Uniform, GenPC, T(-1.0), T(1.0)}, + {1024 * 1024 * 1024, RNG_Uniform, GenPC, T(-1.0), T(1.0)}, + {1024 * 1024 + 2, RNG_Uniform, GenPC, T(-1.0), T(1.0)}, + {32 * 1024 * 1024 + 2, RNG_Uniform, GenPC, T(-1.0), T(1.0)}, + {1024 * 1024 * 1024 + 2, RNG_Uniform, GenPC, T(-1.0), T(1.0)}, + {1024 * 1024 + 1, RNG_Uniform, GenPC, T(-1.0), T(1.0)}, + {32 * 1024 * 1024 + 1, RNG_Uniform, GenPC, T(-1.0), T(1.0)}, + {1024 * 1024 * 1024 + 1, RNG_Uniform, GenPC, T(-1.0), T(1.0)}, + + {1024 * 1024, RNG_Fill, GenPhilox, T(-1.0), T(1.0)}, + {32 * 1024 * 1024, RNG_Fill, GenPhilox, T(-1.0), T(1.0)}, + {1024 * 1024 * 1024, RNG_Fill, GenPhilox, T(-1.0), T(1.0)}, + {1024 * 1024 + 2, RNG_Fill, GenPhilox, 
T(-1.0), T(1.0)}, + {32 * 1024 * 1024 + 2, RNG_Fill, GenPhilox, T(-1.0), T(1.0)}, + {1024 * 1024 * 1024 + 2, RNG_Fill, GenPhilox, T(-1.0), T(1.0)}, + {1024 * 1024 + 1, RNG_Fill, GenPhilox, T(-1.0), T(1.0)}, + {32 * 1024 * 1024 + 1, RNG_Fill, GenPhilox, T(-1.0), T(1.0)}, + {1024 * 1024 * 1024 + 1, RNG_Fill, GenPhilox, T(-1.0), T(1.0)}, + }; +} + +RAFT_BENCH_REGISTER(rng, "", get_rng_input_vecs()); +RAFT_BENCH_REGISTER(rng, "", get_rng_input_vecs()); + +} // namespace raft::bench::random diff --git a/cpp/bench/prims/sparse/convert_csr.cu b/cpp/bench/prims/sparse/convert_csr.cu new file mode 100644 index 000000000..634c749a5 --- /dev/null +++ b/cpp/bench/prims/sparse/convert_csr.cu @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +namespace raft::bench::sparse { + +template +struct bench_param { + index_t num_cols; + index_t num_rows; + index_t divisor; +}; + +template +RAFT_KERNEL init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor) +{ + index_t r = blockDim.y * blockIdx.y + threadIdx.y; + index_t c = blockDim.x * blockIdx.x + threadIdx.x; + + for (; r < num_rows; r += gridDim.y * blockDim.y) { + for (; c < num_cols; c += gridDim.x * blockDim.x) { + adj[r * num_cols + c] = c % divisor == 0; + } + } +} + +template +void init_adj(bool* adj, index_t num_rows, index_t num_cols, index_t divisor, cudaStream_t stream) +{ + // adj matrix: element a_ij is set to one if j is divisible by divisor. + dim3 block(32, 32); + const index_t max_y_grid_dim = 65535; + dim3 grid(num_cols / 32 + 1, (int)min(num_rows / 32 + 1, max_y_grid_dim)); + init_adj_kernel<<>>(adj, num_rows, num_cols, divisor); + RAFT_CHECK_CUDA(stream); +} + +template +struct bench_base : public fixture { + bench_base(const bench_param& p) + : params(p), + handle(stream), + adj(p.num_rows * p.num_cols, stream), + row_ind(p.num_rows, stream), + row_ind_host(p.num_rows), + row_counters(p.num_rows, stream), + // col_ind is over-dimensioned because nnz is unknown at this point + col_ind(p.num_rows * p.num_cols, stream) + { + init_adj(adj.data(), p.num_rows, p.num_cols, p.divisor, stream); + + std::vector row_ind_host(p.num_rows); + for (size_t i = 0; i < row_ind_host.size(); ++i) { + size_t nnz_per_row = raft::ceildiv(p.num_cols, p.divisor); + row_ind_host[i] = nnz_per_row * i; + } + raft::update_device(row_ind.data(), row_ind_host.data(), row_ind.size(), stream); + } + + void run_benchmark(::benchmark::State& state) override + { + loop_on_state(state, [this]() { + raft::sparse::convert::adj_to_csr(handle, + adj.data(), + row_ind.data(), + params.num_rows, + params.num_cols, + row_counters.data(), + col_ind.data()); + }); + + // Estimate bandwidth: + index_t 
num_entries = params.num_rows * params.num_cols; + index_t bytes_read = num_entries * sizeof(bool); + index_t bytes_write = num_entries / params.divisor * sizeof(index_t); + + state.counters["BW"] = benchmark::Counter(bytes_read + bytes_write, + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1024); + state.counters["BW read"] = benchmark::Counter( + bytes_read, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024); + state.counters["BW write"] = benchmark::Counter(bytes_write, + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1024); + + state.counters["Fraction nz"] = benchmark::Counter(100.0 / ((double)params.divisor)); + state.counters["Columns"] = benchmark::Counter(params.num_cols); + state.counters["Rows"] = benchmark::Counter(params.num_rows); + } + + protected: + raft::device_resources handle; + bench_param params; + rmm::device_uvector adj; + rmm::device_uvector row_ind; + std::vector row_ind_host; + rmm::device_uvector row_counters; + rmm::device_uvector col_ind; +}; // struct bench_base + +const int64_t num_cols = 1 << 30; + +const std::vector> bench_params = { + {num_cols, 1, 8}, + {num_cols >> 3, 1 << 3, 8}, + {num_cols >> 6, 1 << 6, 8}, + + {num_cols, 1, 64}, + {num_cols >> 3, 1 << 3, 64}, + {num_cols >> 6, 1 << 6, 64}, + + {num_cols, 1, 2048}, + {num_cols >> 3, 1 << 3, 2048}, + {num_cols >> 6, 1 << 6, 2048}, +}; + +RAFT_BENCH_REGISTER(bench_base, "", bench_params); + +} // namespace raft::bench::sparse diff --git a/cpp/cmake/config.json b/cpp/cmake/config.json new file mode 100644 index 000000000..f7cc50e51 --- /dev/null +++ b/cpp/cmake/config.json @@ -0,0 +1,43 @@ +{ + "parse": { + "additional_commands": { + "CPMFindPackage": { + "kwargs": { + "NAME": 1, + "GITHUB_REPOSITORY": "?", + "GIT_TAG": "?", + "VERSION": "?", + "GIT_SHALLOW": "?", + "OPTIONS": "*", + "FIND_PACKAGE_ARGUMENTS": "*" + } + }, + "ConfigureTest": { + "flags": ["TEST_NAME", "TEST_SRC"] + }, + 
"ConfigureBench": { + "flags": ["BENCH_NAME", "BENCH_SRC"] + } + } + }, + "format": { + "line_width": 100, + "tab_size": 2, + "command_case": "unchanged", + "max_lines_hwrap": 1, + "max_pargs_hwrap": 999, + "dangle_parens": true + }, + "lint": { + "disabled_codes": ["C0301", "C0111", "C0113"], + "function_pattern": "[0-9A-z_]+", + "macro_pattern": "[0-9A-z_]+", + "global_var_pattern": "[A-z][0-9A-z_]+", + "internal_var_pattern": "_[A-z][0-9A-z_]+", + "local_var_pattern": "[A-z][A-z0-9_]+", + "private_var_pattern": "_[0-9A-z_]+", + "public_var_pattern": "[A-z][0-9A-z_]+", + "argument_var_pattern": "[A-z][A-z0-9_]+", + "keyword_pattern": "[A-z][0-9A-z_]+" + } +} diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake new file mode 100644 index 000000000..ea8a077b0 --- /dev/null +++ b/cpp/cmake/modules/ConfigureCUDA.cmake @@ -0,0 +1,57 @@ +# ============================================================================= +# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +if(DISABLE_DEPRECATION_WARNINGS) + list(APPEND RAFT_CXX_FLAGS -Wno-deprecated-declarations) + list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations) +endif() + +# Be very strict when compiling with GCC as host compiler (and thus more lenient when compiling with +# clang) +if(CMAKE_COMPILER_IS_GNUCXX) + list(APPEND RAFT_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations) + list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) + + # set warnings as errors + if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0) + list(APPEND RAFT_CUDA_FLAGS -Werror=all-warnings) + endif() +endif() + +if(CUDA_LOG_COMPILE_TIME) + list(APPEND RAFT_CUDA_FLAGS "--time=nvcc_compile_log.csv") +endif() + +list(APPEND RAFT_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) +list(APPEND RAFT_CXX_FLAGS "-DCUDA_API_PER_THREAD_DEFAULT_STREAM") +list(APPEND RAFT_CUDA_FLAGS "-DCUDA_API_PER_THREAD_DEFAULT_STREAM") +# make sure we produce smallest binary size +list(APPEND RAFT_CUDA_FLAGS -Xfatbin=-compress-all) + +# Option to enable line info in CUDA device compilation to allow introspection when profiling / +# memchecking +if(CUDA_ENABLE_LINEINFO) + list(APPEND RAFT_CUDA_FLAGS -lineinfo) +endif() + +if(OpenMP_FOUND) + list(APPEND RAFT_CUDA_FLAGS -Xcompiler=${OpenMP_CXX_FLAGS}) +endif() + +# Debug options +if(CMAKE_BUILD_TYPE MATCHES Debug) + message(VERBOSE "RAFT: Building with debugging flags") + list(APPEND RAFT_CUDA_FLAGS -G -Xcompiler=-rdynamic) + list(APPEND RAFT_CUDA_FLAGS -Xptxas --suppress-stack-size-warning) +endif() diff --git a/cpp/cmake/modules/FindAVX.cmake b/cpp/cmake/modules/FindAVX.cmake new file mode 100644 index 000000000..7f3b2dfc7 --- /dev/null +++ b/cpp/cmake/modules/FindAVX.cmake @@ -0,0 +1,110 @@ +# ============================================================================= +# Copyright (c) 2016- 
Facebook, Inc (Adam Paszke) +# Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +# Copyright (c) 2011-2013 NYU (Clement Farabet) +# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +# Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) +# +# Note: This file was copied from PyTorch and modified for use in the RAFT library. +# Refer to thirdparty/LICENSES/LICENSE.pytorch for license and additional +# copyright information. +# ============================================================================= + +INCLUDE(CheckCXXSourceRuns) + +SET(AVX_CODE + " + #include + + int main() + { + __m256 a; + a = _mm256_set1_ps(0); + return 0; + } +" +) + +SET(AVX512_CODE + " + #include + + int main() + { + __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0); + __m512i b = a; + __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ); + return 0; + } +" +) + +SET(AVX2_CODE + " + #include + + int main() + { + __m256i a = {0}; + a = _mm256_abs_epi16(a); + __m256i x; + _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code + return 0; + } +" +) + +MACRO(CHECK_SSE lang type flags) + SET(__FLAG_I 1) + SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) + FOREACH(__FLAG ${flags}) + IF(NOT ${lang}_${type}_FOUND) + SET(CMAKE_REQUIRED_FLAGS ${__FLAG}) + CHECK_CXX_SOURCE_RUNS("${${type}_CODE}" ${lang}_HAS_${type}_${__FLAG_I}) + IF(${lang}_HAS_${type}_${__FLAG_I}) + SET(${lang}_${type}_FOUND + TRUE + CACHE BOOL 
"${lang} ${type} support" + ) + SET(${lang}_${type}_FLAGS + "${__FLAG}" + CACHE STRING "${lang} ${type} flags" + ) + ENDIF() + MATH(EXPR __FLAG_I "${__FLAG_I}+1") + ENDIF() + ENDFOREACH() + SET(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) + + IF(NOT ${lang}_${type}_FOUND) + SET(${lang}_${type}_FOUND + FALSE + CACHE BOOL "${lang} ${type} support" + ) + SET(${lang}_${type}_FLAGS + "" + CACHE STRING "${lang} ${type} flags" + ) + ENDIF() + + MARK_AS_ADVANCED(${lang}_${type}_FOUND ${lang}_${type}_FLAGS) + +ENDMACRO() + +# CHECK_SSE(C "AVX" " ;-mavx;/arch:AVX") CHECK_SSE(C "AVX2" " ;-mavx2 -mfma;/arch:AVX2") CHECK_SSE(C +# "AVX512" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma;/arch:AVX512") +# +CHECK_SSE(CXX "AVX" " ;-mavx;/arch:AVX") +CHECK_SSE(CXX "AVX2" " ;-mavx2 -mfma;/arch:AVX2") +CHECK_SSE(CXX "AVX512" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma;/arch:AVX512") diff --git a/cpp/cmake/patches/ggnn.patch b/cpp/cmake/patches/ggnn.patch new file mode 100644 index 000000000..21df3bd73 --- /dev/null +++ b/cpp/cmake/patches/ggnn.patch @@ -0,0 +1,229 @@ +diff --git a/include/ggnn/cache/cuda_simple_knn_sym_cache.cuh b/include/ggnn/cache/cuda_simple_knn_sym_cache.cuh +index 890420e..d792903 100644 +--- a/include/ggnn/cache/cuda_simple_knn_sym_cache.cuh ++++ b/include/ggnn/cache/cuda_simple_knn_sym_cache.cuh +@@ -62,7 +62,7 @@ struct SimpleKNNSymCache { + const ValueT dist_half) + : dist_query(dist_query), dist_half(dist_half) {} + +- __device__ __forceinline__ DistQueryAndHalf() {} ++ DistQueryAndHalf() = default; + }; + + struct DistanceAndNorm { +@@ -98,8 +98,7 @@ struct SimpleKNNSymCache { + KeyT cache; + DistQueryAndHalf dist; + bool flag; +- +- __device__ __forceinline__ SyncTempStorage() {} ++ SyncTempStorage() = default; + }; + + public: +diff --git a/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh b/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh +index 8cbaf0d..6eb72ac 100644 +--- a/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh ++++ 
b/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh +@@ -41,7 +41,6 @@ limitations under the License. + #include "ggnn/sym/cuda_knn_sym_query_layer.cuh" + #include "ggnn/utils/cuda_knn_utils.cuh" + #include "ggnn/utils/cuda_knn_constants.cuh" +-#include "ggnn/utils/cuda_knn_dataset.cuh" + + template + __global__ void divide(ValueT* res, ValueT* input, ValueT N) { +@@ -98,9 +97,7 @@ struct GGNNGPUInstance { + typedef GGNNGraphDevice GGNNGraphDevice; + typedef GGNNGraphHost GGNNGraphHost; + +- const Dataset* dataset; + GGNNGraphBuffer* ggnn_buffer {nullptr}; +- GGNNQuery ggnn_query; + + // Graph Shards resident on the GPU + std::vector ggnn_shards; +@@ -117,13 +114,12 @@ struct GGNNGPUInstance { + // number of shards that need to be processed by this instance + const int num_parts; + +- GGNNGPUInstance(const int gpu_id, const Dataset* dataset, ++ GGNNGPUInstance(const int gpu_id, + const int N_shard, const int L, + const bool enable_construction, const float tau_build, + const int num_parts=1, const int num_cpu_buffers=1) : + N_shard{N_shard}, L{L}, tau_build{tau_build}, +- dataset{dataset}, gpu_id{gpu_id}, +- ggnn_query{dataset->N_query, D, KQuery, num_parts}, ++ gpu_id{gpu_id}, + num_parts{num_parts} + { + CHECK_LE(L, MAX_LAYER); +@@ -135,7 +131,6 @@ struct GGNNGPUInstance { + CHECK_EQ(current_gpu_id, gpu_id) << "cudaSetDevice() needs to be called in advance!"; + } + +- ggnn_query.loadQueriesAsync(dataset->h_query, 0); + + computeGraphParameters(); + +@@ -186,7 +181,7 @@ struct GGNNGPUInstance { + } + + GGNNGPUInstance(const GGNNGPUInstance& other) +- : dataset{nullptr}, ggnn_query{0, D, KQuery}, ++ : + gpu_id{0}, N_shard{0}, num_parts{0} { + // this exists to allow using vector::emplace_back + // when it triggers a reallocation, this code will be called. 
+@@ -305,6 +300,7 @@ struct GGNNGPUInstance { + + // io + ++ /* + void waitForDiskIO(const int shard_id) { + auto& cpu_buffer = ggnn_cpu_buffers[shard_id%ggnn_cpu_buffers.size()]; + if (cpu_buffer.disk_io_thread.joinable()) +@@ -468,11 +464,12 @@ struct GGNNGPUInstance { + CHECK_CUDA(cudaDeviceSynchronize()); + CHECK_CUDA(cudaPeekAtLastError()); + } ++ */ + + // graph operations + + template +- void queryLayer(const int shard_id = 0) const { ++ void queryLayer(const BaseT* d_query, int batch_size, KeyT* d_query_result_ids, ValueT* d_query_result_dists, const int shard_id = 0) const { + CHECK_CUDA(cudaSetDevice(gpu_id)); + const auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); + +@@ -482,21 +479,21 @@ struct GGNNGPUInstance { + + int* m_dist_statistics = nullptr; + if (DIST_STATS) +- cudaMallocManaged(&m_dist_statistics, dataset->N_query * sizeof(int)); ++ cudaMallocManaged(&m_dist_statistics, batch_size * sizeof(int)); + + QueryKernel query_kernel; + query_kernel.d_base = shard.d_base; +- query_kernel.d_query = ggnn_query.d_query; ++ query_kernel.d_query = d_query; + + query_kernel.d_graph = shard.d_graph; +- query_kernel.d_query_results = ggnn_query.d_query_result_ids; +- query_kernel.d_query_results_dists = ggnn_query.d_query_result_dists; ++ query_kernel.d_query_results = d_query_result_ids; ++ query_kernel.d_query_results_dists = d_query_result_dists; + + query_kernel.d_translation = shard.d_translation; + + query_kernel.d_nn1_stats = shard.d_nn1_stats; + +- query_kernel.N = dataset->N_query; ++ query_kernel.N = batch_size; + query_kernel.N_offset = 0; + + query_kernel.d_dist_stats = m_dist_statistics; +@@ -771,6 +768,16 @@ struct GGNNGPUInstance { + sym(layer, shard_id); + } + } ++ ++ void set_stream(cudaStream_t stream) { ++ assert(ggnn_shards.size() == 1); ++ ggnn_shards.at(0).stream = stream; ++ } ++ ++ void set_base_data(const BaseT* dataset) { ++ assert(ggnn_shards.size() == 1); ++ ggnn_shards.at(0).d_base = dataset; ++ } + }; + + #endif // 
INCLUDE_GGNN_CUDA_KNN_GGNN_GPU_INSTANCE_CUH_ +diff --git a/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh b/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh +index c94a8f1..781226d 100644 +--- a/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh ++++ b/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh +@@ -50,7 +50,7 @@ struct GGNNGraphDevice { + ValueT* d_nn1_stats; + + /// base data pointer for the shard. +- BaseT* d_base; ++ const BaseT* d_base; + + /// combined memory pool + char* d_memory; +@@ -69,7 +69,9 @@ struct GGNNGraphDevice { + const size_t selection_translation_size = align8(ST_all * sizeof(KeyT)); + const size_t nn1_stats_size = align8(2 * sizeof(ValueT)); + total_graph_size = graph_size + 2 * selection_translation_size + nn1_stats_size; +- base_size = align8(static_cast(N) * D * sizeof(BaseT)); ++ // base_size = align8(static_cast(N) * D * sizeof(BaseT)); ++ (void) N; ++ (void) D; + + const size_t total_size = base_size+total_graph_size; + +@@ -86,8 +88,7 @@ struct GGNNGraphDevice { + CHECK_CUDA(cudaMalloc(&d_memory, total_size)); + + size_t pos = 0; +- d_base = reinterpret_cast(d_memory+pos); +- pos += base_size; ++ d_base = nullptr; + d_graph = reinterpret_cast(d_memory+pos); + pos += graph_size; + d_translation = reinterpret_cast(d_memory+pos); +@@ -99,14 +100,14 @@ struct GGNNGraphDevice { + + CHECK_EQ(pos, total_size); + +- CHECK_CUDA(cudaStreamCreate(&stream)); ++ // CHECK_CUDA(cudaStreamCreate(&stream)); + + CHECK_CUDA(cudaPeekAtLastError()); + CHECK_CUDA(cudaDeviceSynchronize()); + CHECK_CUDA(cudaPeekAtLastError()); + } + +- GGNNGraphDevice(const GGNNGraphDevice& other) { ++ GGNNGraphDevice(const GGNNGraphDevice&) { + // this exists to allow using vector::emplace_back + // when it triggers a reallocation, this code will be called. + // always make sure that enough memory is reserved ahead of time. 
+@@ -116,7 +117,7 @@ struct GGNNGraphDevice { + ~GGNNGraphDevice() { + cudaFree(d_memory); + +- CHECK_CUDA(cudaStreamDestroy(stream)); ++ // CHECK_CUDA(cudaStreamDestroy(stream)); + } + }; + +diff --git a/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh b/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh +index 2055f9e..ef5843a 100644 +--- a/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh ++++ b/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh +@@ -92,7 +92,7 @@ struct GGNNGraphHost { + CHECK_CUDA(cudaPeekAtLastError()); + } + +- GGNNGraphHost(const GGNNGraphHost& other) { ++ GGNNGraphHost(const GGNNGraphHost&) { + // this exists to allow using vector::emplace_back + // when it triggers a reallocation, this code will be called. + // always make sure that enough memory is reserved ahead of time. +diff --git a/include/ggnn/select/cuda_knn_wrs_select_layer.cuh b/include/ggnn/select/cuda_knn_wrs_select_layer.cuh +index 49d76a1..eef69e6 100644 +--- a/include/ggnn/select/cuda_knn_wrs_select_layer.cuh ++++ b/include/ggnn/select/cuda_knn_wrs_select_layer.cuh +@@ -22,7 +22,6 @@ limitations under the License. 
+ #include + #include + +-#include + #include + + #include "ggnn/utils/cuda_knn_constants.cuh" diff --git a/cpp/cmake/patches/hnswlib.patch b/cpp/cmake/patches/hnswlib.patch new file mode 100644 index 000000000..32c1537c5 --- /dev/null +++ b/cpp/cmake/patches/hnswlib.patch @@ -0,0 +1,130 @@ +diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h +index e95e0b5..f0fe50a 100644 +--- a/hnswlib/hnswalg.h ++++ b/hnswlib/hnswalg.h +@@ -3,6 +3,7 @@ + #include "visited_list_pool.h" + #include "hnswlib.h" + #include ++#include + #include + #include + #include +@@ -16,6 +17,8 @@ namespace hnswlib { + template + class HierarchicalNSW : public AlgorithmInterface { + public: ++ bool base_layer_only{false}; ++ int num_seeds=32; + static const tableint max_update_element_locks = 65536; + HierarchicalNSW(SpaceInterface *s) { + } +@@ -56,7 +59,7 @@ namespace hnswlib { + visited_list_pool_ = new VisitedListPool(1, max_elements); + + //initializations for special treatment of the first node +- enterpoint_node_ = -1; ++ enterpoint_node_ = std::numeric_limits::max(); + maxlevel_ = -1; + + linkLists_ = (char **) malloc(sizeof(void *) * max_elements_); +@@ -527,7 +530,7 @@ namespace hnswlib { + tableint *datal = (tableint *) (data + 1); + for (int i = 0; i < size; i++) { + tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) ++ if (cand > max_elements_) + throw std::runtime_error("cand error"); + dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); + +@@ -1067,7 +1070,7 @@ namespace hnswlib { + tableint *datal = (tableint *) (data + 1); + for (int i = 0; i < size; i++) { + tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) ++ if (cand > max_elements_) + throw std::runtime_error("cand error"); + dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_); + if (d < curdist) { +@@ -1119,28 +1122,41 @@ namespace hnswlib { + tableint currObj = enterpoint_node_; + dist_t curdist = fstdistfunc_(query_data, 
getDataByInternalId(enterpoint_node_), dist_func_param_); + +- for (int level = maxlevel_; level > 0; level--) { +- bool changed = true; +- while (changed) { +- changed = false; +- unsigned int *data; ++ if (base_layer_only) { ++ // You can increase the number of seeds when testing large-scale dataset, num_seeds = 48 for 100M-scale ++ for (int i = 0; i < num_seeds; i++) { ++ tableint obj = i * (max_elements_ / num_seeds); ++ dist_t dist = fstdistfunc_(query_data, getDataByInternalId(obj), dist_func_param_); ++ if (dist < curdist) { ++ curdist = dist; ++ currObj = obj; ++ } ++ } ++ } ++ else{ ++ for (int level = maxlevel_; level > 0; level--) { ++ bool changed = true; ++ while (changed) { ++ changed = false; ++ unsigned int *data; + +- data = (unsigned int *) get_linklist(currObj, level); +- int size = getListCount(data); +- metric_hops++; +- metric_distance_computations+=size; ++ data = (unsigned int *) get_linklist(currObj, level); ++ int size = getListCount(data); ++ metric_hops++; ++ metric_distance_computations+=size; + +- tableint *datal = (tableint *) (data + 1); +- for (int i = 0; i < size; i++) { +- tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) +- throw std::runtime_error("cand error"); +- dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); ++ tableint *datal = (tableint *) (data + 1); ++ for (int i = 0; i < size; i++) { ++ tableint cand = datal[i]; ++ if (cand > max_elements_) ++ throw std::runtime_error("cand error"); ++ dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); + +- if (d < curdist) { +- curdist = d; +- currObj = cand; +- changed = true; ++ if (d < curdist) { ++ curdist = d; ++ currObj = cand; ++ changed = true; ++ } + } + } + } +diff --git a/hnswlib/visited_list_pool.h b/hnswlib/visited_list_pool.h +index 5e1a4a5..4195ebd 100644 +--- a/hnswlib/visited_list_pool.h ++++ b/hnswlib/visited_list_pool.h +@@ -3,6 +3,7 @@ + #include + #include + #include ++#include + + 
namespace hnswlib { + typedef unsigned short int vl_type; +@@ -14,7 +15,7 @@ namespace hnswlib { + unsigned int numelements; + + VisitedList(int numelements1) { +- curV = -1; ++ curV = std::numeric_limits::max(); + numelements = numelements1; + mass = new vl_type[numelements]; + } diff --git a/cpp/cmake/patches/nlohmann_json.patch b/cpp/cmake/patches/nlohmann_json.patch new file mode 100644 index 000000000..83dd56bc1 --- /dev/null +++ b/cpp/cmake/patches/nlohmann_json.patch @@ -0,0 +1,38 @@ +--- nlohmann/json.hpp 2021-05-06 11:40:39.770669693 +0800 ++++ nlohmann/json_patched.hpp 2021-06-02 18:46:43.849334466 +0800 +@@ -16607,6 +16607,21 @@ + } + } + ++ ++ template ::value, int> = 0> ++ bool is_negative_number(NumberType x) ++ { ++ return x < 0; ++ } ++ ++ template < typename NumberType, ++ enable_if_t < std::is_unsigned::value, int > = 0 > ++ bool is_negative_number(NumberType /*unused*/) ++ { ++ return false; ++ } ++ + /*! + @brief dump an integer + +@@ -16649,12 +16664,11 @@ + // use a pointer to fill the buffer + auto buffer_ptr = number_buffer.begin(); // NOLINT(llvm-qualified-auto,readability-qualified-auto,cppcoreguidelines-pro-type-vararg,hicpp-vararg) + +- const bool is_negative = std::is_same::value && !(x >= 0); // see issue #755 + number_unsigned_t abs_value; + + unsigned int n_chars{}; + +- if (is_negative) ++ if (is_negative_number(x)) + { + *buffer_ptr = '-'; + abs_value = remove_sign(static_cast(x)); diff --git a/cpp/cmake/thirdparty/get_cutlass.cmake b/cpp/cmake/thirdparty/get_cutlass.cmake new file mode 100644 index 000000000..0123c4b07 --- /dev/null +++ b/cpp/cmake/thirdparty/get_cutlass.cmake @@ -0,0 +1,92 @@ +# ============================================================================= +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +function(find_and_configure_cutlass) + set(oneValueArgs VERSION REPOSITORY PINNED_TAG) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + # if(RAFT_ENABLE_DIST_DEPENDENCIES OR RAFT_COMPILE_LIBRARIES) + set(CUTLASS_ENABLE_HEADERS_ONLY + ON + CACHE BOOL "Enable only the header library" + ) + set(CUTLASS_NAMESPACE + "raft_cutlass" + CACHE STRING "Top level namespace of CUTLASS" + ) + set(CUTLASS_ENABLE_CUBLAS + OFF + CACHE BOOL "Disable CUTLASS to build with cuBLAS library." 
+ ) + + if (CUDA_STATIC_RUNTIME) + set(CUDART_LIBRARY "${CUDA_cudart_static_LIBRARY}" CACHE FILEPATH "fixing cutlass cmake code" FORCE) + endif() + + rapids_cpm_find( + NvidiaCutlass ${PKG_VERSION} + GLOBAL_TARGETS nvidia::cutlass::cutlass + CPM_ARGS + GIT_REPOSITORY ${PKG_REPOSITORY} + GIT_TAG ${PKG_PINNED_TAG} + GIT_SHALLOW TRUE + OPTIONS "CUDAToolkit_ROOT ${CUDAToolkit_LIBRARY_DIR}" + ) + + if(TARGET CUTLASS AND NOT TARGET nvidia::cutlass::cutlass) + add_library(nvidia::cutlass::cutlass ALIAS CUTLASS) + endif() + + if(NvidiaCutlass_ADDED) + rapids_export( + BUILD NvidiaCutlass + EXPORT_SET NvidiaCutlass + GLOBAL_TARGETS nvidia::cutlass::cutlass + NAMESPACE nvidia::cutlass:: + ) + endif() + # endif() + + # We generate the cutlass-config files when we built cutlass locally, so always do + # `find_dependency` + rapids_export_package( + BUILD NvidiaCutlass raft-exports GLOBAL_TARGETS nvidia::cutlass::cutlass + ) + rapids_export_package( + INSTALL NvidiaCutlass raft-exports GLOBAL_TARGETS nvidia::cutlass::cutlass + ) + + # Tell cmake where it can find the generated NvidiaCutlass-config.cmake we wrote. 
+ include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + INSTALL NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}/../]=] + EXPORT_SET raft-exports + ) + rapids_export_find_package_root( + BUILD NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET raft-exports + ) +endfunction() + +if(NOT RAFT_CUTLASS_GIT_TAG) + set(RAFT_CUTLASS_GIT_TAG v2.10.0) +endif() + +if(NOT RAFT_CUTLASS_GIT_REPOSITORY) + set(RAFT_CUTLASS_GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git) +endif() + +find_and_configure_cutlass( + VERSION 2.10.0 REPOSITORY ${RAFT_CUTLASS_GIT_REPOSITORY} PINNED_TAG ${RAFT_CUTLASS_GIT_TAG} +) diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake new file mode 100644 index 000000000..85829554a --- /dev/null +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -0,0 +1,110 @@ +#============================================================================= +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_faiss) + set(oneValueArgs VERSION REPOSITORY PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL ENABLE_GPU) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + rapids_find_generate_module(faiss + HEADER_NAMES faiss/IndexFlat.h + LIBRARY_NAMES faiss + ) + + set(BUILD_SHARED_LIBS ON) + if (PKG_BUILD_STATIC_LIBS) + set(BUILD_SHARED_LIBS OFF) + set(CPM_DOWNLOAD_faiss ON) + endif() + + include(cmake/modules/FindAVX.cmake) + + # Link against AVX CPU lib if it exists + set(RAFT_FAISS_GLOBAL_TARGETS faiss::faiss) + set(RAFT_FAISS_EXPORT_GLOBAL_TARGETS faiss) + set(RAFT_FAISS_OPT_LEVEL "generic") + if(CXX_AVX_FOUND) + set(RAFT_FAISS_OPT_LEVEL "avx2") + list(APPEND RAFT_FAISS_GLOBAL_TARGETS faiss::faiss_avx2) + list(APPEND RAFT_FAISS_EXPORT_GLOBAL_TARGETS faiss_avx2) + endif() + + rapids_cpm_find(faiss ${PKG_VERSION} + GLOBAL_TARGETS ${RAFT_FAISS_GLOBAL_TARGETS} + CPM_ARGS + GIT_REPOSITORY ${PKG_REPOSITORY} + GIT_TAG ${PKG_PINNED_TAG} + EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL} + OPTIONS + "FAISS_ENABLE_GPU ${PKG_ENABLE_GPU}" + "FAISS_ENABLE_PYTHON OFF" + "FAISS_OPT_LEVEL ${RAFT_FAISS_OPT_LEVEL}" + "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}" + "BUILD_TESTING OFF" + "CMAKE_MESSAGE_LOG_LEVEL VERBOSE" + ) + + if(TARGET faiss AND NOT TARGET faiss::faiss) + add_library(faiss::faiss ALIAS faiss) + endif() + + if(CXX_AVX_FOUND) + + if(TARGET faiss_avx2 AND NOT TARGET faiss::faiss_avx2) + add_library(faiss::faiss_avx2 ALIAS faiss_avx2) + endif() + endif() + + + if(faiss_ADDED) + rapids_export(BUILD faiss + EXPORT_SET faiss-targets + GLOBAL_TARGETS ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS} + NAMESPACE faiss::) + endif() + + # We generate the faiss-config files when we built faiss locally, so always do `find_dependency` + rapids_export_package(BUILD OpenMP raft-ann-bench-exports) # faiss uses openMP but doesn't export a need for it + 
rapids_export_package(BUILD faiss raft-ann-bench-exports GLOBAL_TARGETS ${RAFT_FAISS_GLOBAL_TARGETS} ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS}) + rapids_export_package(INSTALL faiss raft-ann-bench-exports GLOBAL_TARGETS ${RAFT_FAISS_GLOBAL_TARGETS} ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS}) + + # Tell cmake where it can find the generated faiss-config.cmake we wrote. + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root(BUILD faiss [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET raft-ann-bench-exports) +endfunction() + +if(NOT RAFT_FAISS_GIT_TAG) + # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC + # (https://github.com/facebookresearch/faiss/pull/2446) + set(RAFT_FAISS_GIT_TAG fea/statically-link-ctk) + # set(RAFT_FAISS_GIT_TAG bde7c0027191f29c9dadafe4f6e68ca0ee31fb30) +endif() + +if(NOT RAFT_FAISS_GIT_REPOSITORY) + # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC + # (https://github.com/facebookresearch/faiss/pull/2446) + set(RAFT_FAISS_GIT_REPOSITORY https://github.com/cjnolet/faiss.git) + # set(RAFT_FAISS_GIT_REPOSITORY https://github.com/facebookresearch/faiss.git) +endif() + +find_and_configure_faiss(VERSION 1.7.4 + REPOSITORY ${RAFT_FAISS_GIT_REPOSITORY} + PINNED_TAG ${RAFT_FAISS_GIT_TAG} + BUILD_STATIC_LIBS ${RAFT_USE_FAISS_STATIC} + EXCLUDE_FROM_ALL ${RAFT_EXCLUDE_FAISS_FROM_ALL} + ENABLE_GPU ${RAFT_FAISS_ENABLE_GPU}) + diff --git a/cpp/cmake/thirdparty/get_fmt.cmake b/cpp/cmake/thirdparty/get_fmt.cmake new file mode 100644 index 000000000..c06f8a78b --- /dev/null +++ b/cpp/cmake/thirdparty/get_fmt.cmake @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# Use CPM to find or clone fmt +function(find_and_configure_fmt) + + include(${rapids-cmake-dir}/cpm/fmt.cmake) + rapids_cpm_fmt(INSTALL_EXPORT_SET rmm-exports BUILD_EXPORT_SET rmm-exports) +endfunction() + +find_and_configure_fmt() \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_ggnn.cmake b/cpp/cmake/thirdparty/get_ggnn.cmake new file mode 100644 index 000000000..708acb6b8 --- /dev/null +++ b/cpp/cmake/thirdparty/get_ggnn.cmake @@ -0,0 +1,44 @@ +#============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_ggnn) + set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + set ( EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ ) + if (NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/_deps/ggnn-src/) + + execute_process ( + COMMAND git clone "https://github.com/${PKG_FORK}/ggnn" --branch ${PKG_PINNED_TAG} ggnn-src + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/ ) + + message("SOURCE ${CMAKE_CURRENT_SOURCE_DIR}") + execute_process ( + COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/ggnn.patch + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/ggnn-src + ) + endif() + +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# CPM_raft_SOURCE=/path/to/local/raft +find_and_configure_ggnn(VERSION 0.5 + FORK cgtuebingen + PINNED_TAG release_0.5 + EXCLUDE_FROM_ALL YES) diff --git a/cpp/cmake/thirdparty/get_glog.cmake b/cpp/cmake/thirdparty/get_glog.cmake new file mode 100644 index 000000000..35a9170f9 --- /dev/null +++ b/cpp/cmake/thirdparty/get_glog.cmake @@ -0,0 +1,48 @@ +#============================================================================= +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================
+
+# Use CPM to find or clone glog, exporting the glog::glog target through the
+# raft-exports export sets.
+function(find_and_configure_glog)
+  set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL)
+  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  rapids_cpm_find(glog ${PKG_VERSION}
+          GLOBAL_TARGETS glog::glog
+          BUILD_EXPORT_SET raft-exports
+          INSTALL_EXPORT_SET raft-exports
+          CPM_ARGS
+          GIT_REPOSITORY https://github.com/${PKG_FORK}/glog.git
+          GIT_TAG ${PKG_PINNED_TAG}
+          EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL}
+          )
+
+  if(glog_ADDED)
+    message(VERBOSE "RAFT: Using glog located in ${glog_SOURCE_DIR}")
+  else()
+    message(VERBOSE "RAFT: Using glog located in ${glog_DIR}")
+  endif()
+
+
+endfunction()
+
+# Change pinned tag here to test a commit in CI
+# To use a different glog locally, set the CMake variable
+# CPM_glog_SOURCE=/path/to/local/glog
+find_and_configure_glog(VERSION 0.6.0
+        FORK google
+        PINNED_TAG v0.6.0
+        EXCLUDE_FROM_ALL ON
+        )
diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake
new file mode 100644
index 000000000..34fca4c7d
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_gtest.cmake
@@ -0,0 +1,22 @@
+#=============================================================================
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+# Use CPM to find or clone GoogleTest via the rapids-cmake pre-configured
+# gtest package (no export sets: gtest is a test-only dependency).
+function(find_and_configure_gtest )
+  include(${rapids-cmake-dir}/cpm/gtest.cmake)
+  rapids_cpm_gtest()
+endfunction()
+
+find_and_configure_gtest()
diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake
new file mode 100644
index 000000000..a4ceacae3
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_hnswlib.cmake
@@ -0,0 +1,54 @@
+#=============================================================================
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+# Clone (and patch) hnswlib into the build tree.  hnswlib is header-only and
+# provides no CMake package, so it is fetched with a raw `git clone` rather
+# than through CPM.
+#
+# Arguments (parsed as one-value keywords):
+#   VERSION          - informational only; the checkout is governed by PINNED_TAG
+#   FORK             - GitHub organization/user to clone from
+#   PINNED_TAG       - branch or tag to clone
+#   EXCLUDE_FROM_ALL - accepted for symmetry with sibling scripts; unused here
+#
+# On return, HNSW_CXX_FLAGS in the caller's scope holds the widest supported
+# AVX compile flags (possibly empty).
+function(find_and_configure_hnswlib)
+  set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL)
+  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+
+  set(EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  if(NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/_deps/hnswlib-src)
+    # Honor the FORK/PINNED_TAG arguments (previously hardcoded to
+    # nmslib/v0.6.2, silently ignoring what the caller requested) and fail
+    # the configure step if the clone or the patch does not succeed.
+    execute_process(
+      COMMAND git clone --branch=${PKG_PINNED_TAG} https://github.com/${PKG_FORK}/hnswlib.git hnswlib-src
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps
+      RESULT_VARIABLE hnswlib_clone_result)
+    if(NOT hnswlib_clone_result EQUAL 0)
+      message(FATAL_ERROR "Failed to clone hnswlib (${PKG_FORK}@${PKG_PINNED_TAG})")
+    endif()
+
+    # Demoted leftover debug print to VERBOSE so default configures stay quiet.
+    message(VERBOSE "SOURCE ${CMAKE_CURRENT_SOURCE_DIR}")
+    execute_process(
+      COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/hnswlib.patch
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src
+      RESULT_VARIABLE hnswlib_patch_result)
+    if(NOT hnswlib_patch_result EQUAL 0)
+      message(FATAL_ERROR "Failed to apply cmake/patches/hnswlib.patch to hnswlib-src")
+    endif()
+  endif()
+
+  include(cmake/modules/FindAVX.cmake)
+
+  # Prefer the widest SIMD extension available (AVX512 > AVX2 > AVX).  The
+  # previous ordering tested plain AVX first, so it was selected even on
+  # AVX2/AVX512-capable toolchains.
+  set(HNSW_CXX_FLAGS "")
+  if(CXX_AVX512_FOUND)
+    set(HNSW_CXX_FLAGS "${HNSW_CXX_FLAGS} ${CXX_AVX512_FLAGS}")
+  elseif(CXX_AVX2_FOUND)
+    set(HNSW_CXX_FLAGS "${HNSW_CXX_FLAGS} ${CXX_AVX2_FLAGS}")
+  elseif(CXX_AVX_FOUND)
+    set(HNSW_CXX_FLAGS "${HNSW_CXX_FLAGS} ${CXX_AVX_FLAGS}")
+  endif()
+  # set() inside a function is local; without PARENT_SCOPE the computed flags
+  # were dead on return.
+  set(HNSW_CXX_FLAGS "${HNSW_CXX_FLAGS}" PARENT_SCOPE)
+endfunction()
+
+# Change pinned tag here to test a commit in CI
+# To use a different hnswlib locally, point FORK/PINNED_TAG at your copy (this
+# dependency is cloned directly, so CPM_*_SOURCE overrides do not apply).
+find_and_configure_hnswlib(VERSION 0.6.2
+                           FORK nmslib
+                           PINNED_TAG v0.6.2
+                           EXCLUDE_FROM_ALL YES)
diff --git a/cpp/cmake/thirdparty/get_nlohmann_json.cmake b/cpp/cmake/thirdparty/get_nlohmann_json.cmake
new file mode 100644
index 000000000..5de98a47c
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_nlohmann_json.cmake
@@ -0,0 +1,39 @@
+#=============================================================================
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+# Use CPM to find or clone nlohmann/json, exporting the
+# nlohmann_json::nlohmann_json target through the raft-bench-ann-exports
+# export sets (benchmark-only dependency).
+function(find_and_configure_nlohmann_json)
+  set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL)
+  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  rapids_cpm_find(nlohmann_json ${PKG_VERSION}
+          GLOBAL_TARGETS nlohmann_json::nlohmann_json
+          BUILD_EXPORT_SET raft-bench-ann-exports
+          INSTALL_EXPORT_SET raft-bench-ann-exports
+          CPM_ARGS
+          GIT_REPOSITORY https://github.com/${PKG_FORK}/json.git
+          GIT_TAG ${PKG_PINNED_TAG}
+          EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL})
+
+endfunction()
+
+# Change pinned tag here to test a commit in CI
+# To use a different nlohmann_json locally, set the CMake variable
+# CPM_nlohmann_json_SOURCE=/path/to/local/json
+find_and_configure_nlohmann_json(VERSION 3.11.2
+        FORK nlohmann
+        PINNED_TAG v3.11.2
+        EXCLUDE_FROM_ALL YES)
diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake
new file mode 100644
index 000000000..a303193bc
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_rmm.cmake
@@ -0,0 +1,23 @@
+#=============================================================================
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+# Use CPM to find or clone RMM via the rapids-cmake pre-configured package,
+# registered under the raft-exports export sets.
+function(find_and_configure_rmm)
+  include(${rapids-cmake-dir}/cpm/rmm.cmake)
+  rapids_cpm_rmm(BUILD_EXPORT_SET raft-exports
+          INSTALL_EXPORT_SET raft-exports)
+endfunction()
+
+find_and_configure_rmm()
diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake
new file mode 100644
index 000000000..7be7804c7
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_spdlog.cmake
@@ -0,0 +1,33 @@
+# =============================================================================
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# Use CPM to find or clone spdlog
+# NOTE(review): spdlog is registered under the rmm-exports export sets while
+# sibling scripts use raft-exports — confirm the export-set name is
+# intentional for this repository.
+function(find_and_configure_spdlog)
+
+  include(${rapids-cmake-dir}/cpm/spdlog.cmake)
+  # EXTERNAL_FMT_HO: build spdlog against the external, header-only fmt
+  # fetched by get_fmt.cmake rather than spdlog's bundled copy.
+  rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET rmm-exports)
+  rapids_export_package(BUILD spdlog rmm-exports)
+
+  if(spdlog_ADDED)
+    rapids_export(
+      BUILD spdlog
+      EXPORT_SET spdlog
+      GLOBAL_TARGETS spdlog spdlog_header_only
+      NAMESPACE spdlog::)
+    include("${rapids-cmake-dir}/export/find_package_root.cmake")
+    rapids_export_find_package_root(BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET rmm-exports)
+  endif()
+endfunction()
+
+find_and_configure_spdlog()
\ No newline at end of file
diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake
new file mode 100644
index 000000000..6e37aab40
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_thrust.cmake
@@ -0,0 +1,24 @@
+# =============================================================================
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# Use CPM to find or clone thrust
+# NAMESPACE raft wraps Thrust targets under the raft:: prefix so this build's
+# Thrust does not collide with a consumer's own copy.
+function(find_and_configure_thrust)
+  include(${rapids-cmake-dir}/cpm/thrust.cmake)
+
+  rapids_cpm_thrust( NAMESPACE raft
+          BUILD_EXPORT_SET raft-exports
+          INSTALL_EXPORT_SET raft-exports)
+endfunction()
+
+find_and_configure_thrust()
diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile
new file mode 100644
index 000000000..3eb0763ea
--- /dev/null
+++ b/cpp/doxygen/Doxyfile
@@ -0,0 +1,2546 @@
+# Doxyfile 1.8.20
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project. + +PROJECT_NAME = "RAFT C++ API" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = "24.02" + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. 
+ +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all generated output in the proper direction. +# Possible values are: None, LTR, RTL and Context. +# The default value is: None. + +OUTPUT_TEXT_DIRECTION = None + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. 
Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. 
+ +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. + +JAVADOC_BANNER = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! 
or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# By default Python docstrings are displayed as preformatted text and doxygen's +# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the +# doxygen's special commands can be used and the contents of the docstring +# documentation blocks is shown as doxygen documentation. +# The default value is: YES. + +PYTHON_DOCSTRING = YES + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines (in the resulting output). 
You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. +# When you need a literal { or } or , in the value part of an alias you have to +# escape them by means of a backslash (\), this can lead to conflicts with the +# commands \{ and \} for these it is advised to use the version @{ and @} or use +# a double escape (\\{ and \\}) + +ALIASES = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. 
With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, +# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL, +# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files). For instance to make doxygen treat .inc files +# as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = cu=C++ \ + cuh=C++ + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See https://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. 
+ +TOC_INCLUDE_HEADINGS = 5 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. 
+ +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. 
+
+INLINE_SIMPLE_STRUCTS = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE = 0
+
+# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will base this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which effectively disables parallel processing. Please report any issues you
+# encounter.
Generating dot graphs in parallel is controlled by the +# DOT_NUM_THREADS setting. +# Minimum value: 0, maximum value: 32, default value: 1. + +NUM_PROC_THREADS = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = NO + +# This flag is only useful for Objective-C code. 
If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# declarations. If set to NO, these declarations will be included in the +# documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. 
If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# (including Cygwin) and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. 
+# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. 
+ +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. 
The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command <command> <input-file>, where <command> is the value of the +# FILE_VERSION_FILTER tag, and <input-file> is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. 
You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. 
+ +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. If +# EXTRACT_ALL is set to YES then this flag will automatically be disabled. +# The default value is: NO. + +WARN_NO_PARAMDOC = YES + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = YES + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. 
You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = main_page.md \ + ../include + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), +# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen +# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, +# *.vhdl, *.ucf, *.qsf and *.ice. + +FILE_PATTERNS = *.hpp \ + *.cuh + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. 
This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = ../include/raft/sparse/linalg/symmetrize.hpp \ + ../include/raft/cache \ + ../include/raft/common \ + ../include/raft/lap \ + ../include/raft/sparse/selection \ + ../include/raft/sparse/csr.hpp \ + ../include/raft/linalg/lanczos.cuh \ + ../include/raft/linalg/lanczos.hpp \ + ../include/raft/util/cuda_utils.cuh \ + ../include/raft/util/cudart_utils.hpp \ + ../include/raft/util/device_atomics.cuh \ + ../include/raft/util/device_utils.cuh \ + ../include/raft/core/error.hpp \ + ../include/raft/core/handle.hpp \ + ../include/raft/util/integer_utils.hpp \ + ../include/raft/util/pow2_utils.cuh \ + ../include/raft/util/vectorized.cuh \ + ../include/raft/raft.hpp \ + ../include/raft/core/cudart_utils.hpp \ + ../include/raft/matrix/math.cuh \ + ../include/raft/matrix/matrix.cuh + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +# TODO: remove specializations from exclude patterns when headers have been removed. +EXCLUDE_PATTERNS = */detail/* \ + */specializations/* \ + */thirdparty/* + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. 
Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = detail + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. 
+# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. 
+ +USE_MDFILE_AS_MAINPAGE = main_page.md + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# entity all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. 
+ +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. 
+ +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = NO + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = header.html + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. 
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. 
+# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 266 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 255 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 52 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via JavaScript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have JavaScript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_DYNAMIC_MENUS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: https://developer.apple.com/xcode/), introduced with OSX +# 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. 
A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. 
Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the main .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
+ +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. 
For more information please see Qt Help Project / Custom +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. 
+ +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg +# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see +# https://inkscape.org) to generate formulas as SVG images instead of PNGs for +# the HTML output. These images will generally look nicer at scaled resolutions. +# Possible values are: png (the default) and svg (looks nicer but requires the +# pdf2svg or inkscape tool). +# The default value is: png. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FORMULA_FORMAT = png + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. 
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT = YES
+
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE =
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want formulas to look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX = YES
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. 
However, it is strongly recommended to install a local copy of
+# MathJax from https://www.mathjax.org before deployment.
+# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH = https://cdn.jsdelivr.net/npm/mathjax@2
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/