diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 303a6c6b4..7d8bb0022 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -184,6 +184,8 @@ if(BUILD_MICRO_BENCH OR BUILD_ANN_BENCH) rapids_cpm_gbench() endif() +include(cmake/thirdparty/get_cutlass.cmake) + # ################################################################################################## # * cuvs --------------------------------------------------------------------- @@ -368,7 +370,7 @@ target_include_directories( if(NOT BUILD_CPU_ONLY) # Keep cuVS as lightweight as possible. Only CUDA libs and rmm should be used in global target. - target_link_libraries(cuvs PUBLIC raft::raft) + target_link_libraries(cuvs PUBLIC raft::raft nvidia::cutlass::cutlass) endif() # Endian detection diff --git a/cpp/cmake/thirdparty/get_cutlass.cmake b/cpp/cmake/thirdparty/get_cutlass.cmake index 0123c4b07..61065318b 100644 --- a/cpp/cmake/thirdparty/get_cutlass.cmake +++ b/cpp/cmake/thirdparty/get_cutlass.cmake @@ -22,7 +22,7 @@ function(find_and_configure_cutlass) CACHE BOOL "Enable only the header library" ) set(CUTLASS_NAMESPACE - "raft_cutlass" + "cuvs_cutlass" CACHE STRING "Top level namespace of CUTLASS" ) set(CUTLASS_ENABLE_CUBLAS @@ -61,21 +61,21 @@ function(find_and_configure_cutlass) # We generate the cutlass-config files when we built cutlass locally, so always do # `find_dependency` rapids_export_package( - BUILD NvidiaCutlass raft-exports GLOBAL_TARGETS nvidia::cutlass::cutlass + BUILD NvidiaCutlass cuvs-exports GLOBAL_TARGETS nvidia::cutlass::cutlass ) rapids_export_package( - INSTALL NvidiaCutlass raft-exports GLOBAL_TARGETS nvidia::cutlass::cutlass + INSTALL NvidiaCutlass cuvs-exports GLOBAL_TARGETS nvidia::cutlass::cutlass ) # Tell cmake where it can find the generated NvidiaCutlass-config.cmake we wrote. include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root( INSTALL NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}/../]=] - EXPORT_SET raft-exports + EXPORT_SET cuvs-exports ) rapids_export_find_package_root( BUILD NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}]=] - EXPORT_SET raft-exports + EXPORT_SET cuvs-exports ) endfunction() diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake index 85829554a..718ac7c2d 100644 --- a/cpp/cmake/thirdparty/get_faiss.cmake +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -77,14 +77,14 @@ function(find_and_configure_faiss) endif() # We generate the faiss-config files when we built faiss locally, so always do `find_dependency` - rapids_export_package(BUILD OpenMP raft-ann-bench-exports) # faiss uses openMP but doesn't export a need for it - rapids_export_package(BUILD faiss raft-ann-bench-exports GLOBAL_TARGETS ${RAFT_FAISS_GLOBAL_TARGETS} ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS}) - rapids_export_package(INSTALL faiss raft-ann-bench-exports GLOBAL_TARGETS ${RAFT_FAISS_GLOBAL_TARGETS} ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS}) + rapids_export_package(BUILD OpenMP cuvs-ann-bench-exports) # faiss uses openMP but doesn't export a need for it + rapids_export_package(BUILD faiss cuvs-ann-bench-exports GLOBAL_TARGETS ${RAFT_FAISS_GLOBAL_TARGETS} ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS}) + rapids_export_package(INSTALL faiss cuvs-ann-bench-exports GLOBAL_TARGETS ${RAFT_FAISS_GLOBAL_TARGETS} ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS}) # Tell cmake where it can find the generated faiss-config.cmake we wrote. include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root(BUILD faiss [=[${CMAKE_CURRENT_LIST_DIR}]=] - EXPORT_SET raft-ann-bench-exports) + EXPORT_SET cuvs-ann-bench-exports) endfunction() if(NOT RAFT_FAISS_GIT_TAG) diff --git a/cpp/cmake/thirdparty/get_ggnn.cmake b/cpp/cmake/thirdparty/get_ggnn.cmake index 708acb6b8..883fbd68c 100644 --- a/cpp/cmake/thirdparty/get_ggnn.cmake +++ b/cpp/cmake/thirdparty/get_ggnn.cmake @@ -37,7 +37,7 @@ endfunction() # Change pinned tag here to test a commit in CI # To use a different RAFT locally, set the CMake variable -# CPM_raft_SOURCE=/path/to/local/raft +# CPM_cuvs_SOURCE=/path/to/local/cuvs find_and_configure_ggnn(VERSION 0.5 FORK cgtuebingen PINNED_TAG release_0.5 diff --git a/cpp/cmake/thirdparty/get_glog.cmake b/cpp/cmake/thirdparty/get_glog.cmake index 35a9170f9..56ed45274 100644 --- a/cpp/cmake/thirdparty/get_glog.cmake +++ b/cpp/cmake/thirdparty/get_glog.cmake @@ -21,8 +21,8 @@ function(find_and_configure_glog) rapids_cpm_find(glog ${PKG_VERSION} GLOBAL_TARGETS glog::glog - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports + BUILD_EXPORT_SET cuvs-exports + INSTALL_EXPORT_SET cuvs-exports CPM_ARGS GIT_REPOSITORY https://github.com/${PKG_FORK}/glog.git GIT_TAG ${PKG_PINNED_TAG} diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake index a4ceacae3..2f0b07f1d 100644 --- a/cpp/cmake/thirdparty/get_hnswlib.cmake +++ b/cpp/cmake/thirdparty/get_hnswlib.cmake @@ -47,7 +47,7 @@ endfunction() # Change pinned tag here to test a commit in CI # To use a different RAFT locally, set the CMake variable -# CPM_raft_SOURCE=/path/to/local/raft +# CPM_cuvs_SOURCE=/path/to/local/cuvs find_and_configure_hnswlib(VERSION 0.6.2 FORK nmslib PINNED_TAG v0.6.2 diff --git a/cpp/cmake/thirdparty/get_nlohmann_json.cmake b/cpp/cmake/thirdparty/get_nlohmann_json.cmake index 5de98a47c..9827b7ee6 100644 --- a/cpp/cmake/thirdparty/get_nlohmann_json.cmake +++ b/cpp/cmake/thirdparty/get_nlohmann_json.cmake @@ -21,8 +21,8 @@ function(find_and_configure_nlohmann_json) rapids_cpm_find(nlohmann_json ${PKG_VERSION} GLOBAL_TARGETS nlohmann_json::nlohmann_json - BUILD_EXPORT_SET raft-bench-ann-exports - INSTALL_EXPORT_SET raft-bench-ann-exports + BUILD_EXPORT_SET cuvs-bench-exports + INSTALL_EXPORT_SET cuvs-bench-exports CPM_ARGS GIT_REPOSITORY https://github.com/${PKG_FORK}/json.git GIT_TAG ${PKG_PINNED_TAG} @@ -32,7 +32,7 @@ endfunction() # Change pinned tag here to test a commit in CI # To use a different RAFT locally, set the CMake variable -# CPM_raft_SOURCE=/path/to/local/raft +# CPM_cuvs_SOURCE=/path/to/local/cuvs find_and_configure_nlohmann_json(VERSION 3.11.2 FORK nlohmann PINNED_TAG v3.11.2 diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake index 6128b5c43..d45be4aef 100644 --- a/cpp/cmake/thirdparty/get_raft.cmake +++ b/cpp/cmake/thirdparty/get_raft.cmake @@ -35,8 +35,8 @@ function(find_and_configure_raft) #----------------------------------------------------- rapids_cpm_find(raft ${PKG_VERSION} GLOBAL_TARGETS raft::raft - BUILD_EXPORT_SET raft-template-exports - INSTALL_EXPORT_SET raft-template-exports + BUILD_EXPORT_SET cuvs-template-exports + INSTALL_EXPORT_SET cuvs-template-exports COMPONENTS ${RAFT_COMPONENTS} CPM_ARGS GIT_REPOSITORY https://github.com/${PKG_FORK}/raft.git diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index a303193bc..97796215e 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -16,8 +16,8 @@ function(find_and_configure_rmm) include(${rapids-cmake-dir}/cpm/rmm.cmake) - rapids_cpm_rmm(BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports) + rapids_cpm_rmm(BUILD_EXPORT_SET cuvs-exports + INSTALL_EXPORT_SET cuvs-exports) endfunction() find_and_configure_rmm() diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 6e37aab40..d2d6cb3e6 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -16,9 +16,9 @@ function(find_and_configure_thrust) include(${rapids-cmake-dir}/cpm/thrust.cmake) - rapids_cpm_thrust( NAMESPACE raft - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports) + rapids_cpm_thrust( NAMESPACE cuvs + BUILD_EXPORT_SET cuvs-exports + INSTALL_EXPORT_SET cuvs-exports) endfunction() find_and_configure_thrust() diff --git a/cpp/include/cuvs/distance/distance-inl.cuh b/cpp/include/cuvs/distance/distance-inl.cuh index 95d19e562..0abdeacff 100644 --- a/cpp/include/cuvs/distance/distance-inl.cuh +++ b/cpp/include/cuvs/distance/distance-inl.cuh @@ -409,7 +409,7 @@ void distance(raft::resources const& handle, RAFT_EXPECTS(x.is_exhaustive(), "Input x must be contiguous."); RAFT_EXPECTS(y.is_exhaustive(), "Input y must be contiguous."); - constexpr auto is_rowmajor = std::is_same_v; + constexpr auto is_rowmajor = std::is_same_v; distance(handle, x.data_handle(), @@ -454,7 +454,7 @@ void pairwise_distance(raft::resources const& handle, RAFT_EXPECTS(y.is_exhaustive(), "Input y must be contiguous."); RAFT_EXPECTS(dist.is_exhaustive(), "Output must be contiguous."); - constexpr auto rowmajor = std::is_same_v; + constexpr auto rowmajor = std::is_same_v; auto stream = raft::resource::get_cuda_stream(handle); rmm::device_uvector workspace(0, stream); diff --git a/cpp/include/cuvs/neighbors/brute_force-inl.cuh b/cpp/include/cuvs/neighbors/brute_force-inl.cuh index 88904dbae..b5584c5b7 100644 --- a/cpp/include/cuvs/neighbors/brute_force-inl.cuh +++ b/cpp/include/cuvs/neighbors/brute_force-inl.cuh @@ -303,7 +303,7 @@ index build( { // certain distance metrics can benefit by pre-calculating the norms for the index dataset // which lets us avoid calculating these at query time - std::optional> norms; + std::optional> norms; if (metric == cuvs::distance::DistanceType::L2Expanded || metric == cuvs::distance::DistanceType::L2SqrtExpanded || metric == cuvs::distance::DistanceType::CosineExpanded) { diff --git a/cpp/include/cuvs/neighbors/brute_force_types.hpp b/cpp/include/cuvs/neighbors/brute_force_types.hpp index 6e80496f2..0d3252d71 100644 --- a/cpp/include/cuvs/neighbors/brute_force_types.hpp +++ b/cpp/include/cuvs/neighbors/brute_force_types.hpp @@ -106,7 +106,7 @@ struct index : ann::index { { if (norms_) { norms_view_ = raft::make_const_mdspan(norms_.value().view()); } update_dataset(res, dataset); - resource::sync_stream(res); + raft::resource::sync_stream(res); } /** Construct a brute force index from dataset @@ -121,7 +121,7 @@ struct index : ann::index { T metric_arg = 0.0) : ann::index(), metric_(metric), - dataset_(make_device_matrix(res, 0, 0)), + dataset_(raft::make_device_matrix(res, 0, 0)), dataset_view_(dataset_view), norms_view_(norms_view), metric_arg_(metric_arg) diff --git a/cpp/include/cuvs/neighbors/detail/cagra/search_plan.cuh b/cpp/include/cuvs/neighbors/detail/cagra/search_plan.cuh index c4db1431a..f83418b5c 100644 --- a/cpp/include/cuvs/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/include/cuvs/neighbors/detail/cagra/search_plan.cuh @@ -91,7 +91,7 @@ struct search_plan_impl : public search_plan_impl_base { uint32_t topk) : search_plan_impl_base(params, dim, graph_degree, topk), hashmap(0, raft::resource::get_cuda_stream(res)), - num_executed_iterations(0, resource::get_cuda_stream(res)), + num_executed_iterations(0, raft::resource::get_cuda_stream(res)), dev_seed(0, raft::resource::get_cuda_stream(res)), num_seeds(0) { diff --git a/cpp/include/cuvs/neighbors/detail/faiss_select/Select.cuh b/cpp/include/cuvs/neighbors/detail/faiss_select/Select.cuh index 1ddd610ff..9181fba79 100644 --- a/cpp/include/cuvs/neighbors/detail/faiss_select/Select.cuh +++ b/cpp/include/cuvs/neighbors/detail/faiss_select/Select.cuh @@ -191,7 +191,7 @@ struct BlockSelect { warpSortAnyRegisters(threadK, threadV); constexpr int kNumWarpQRegisters = NumWarpQ / raft::WarpSize; - K raft::warpKRegisters[kNumWarpQRegisters]; + K warpKRegisters[kNumWarpQRegisters]; V warpVRegisters[kNumWarpQRegisters]; #pragma unroll @@ -200,18 +200,18 @@ struct BlockSelect { warpVRegisters[i] = warpV[i * raft::WarpSize + laneId]; } - warpFence(); + raft::warpFence(); // The warp queue is already sorted, and now that we've sorted the // per-thread queue, merge both sorted lists together, producing // one sorted list warpMergeAnyRegisters( - raft::warpKRegisters, warpVRegisters, threadK, threadV); + warpKRegisters, warpVRegisters, threadK, threadV); // Write back out the warp queue #pragma unroll for (int i = 0; i < kNumWarpQRegisters; ++i) { - warpK[i * raft::WarpSize + laneId] = raft::warpKRegisters[i]; + warpK[i * raft::WarpSize + laneId] = warpKRegisters[i]; warpV[i * raft::WarpSize + laneId] = warpVRegisters[i]; } diff --git a/cpp/include/cuvs/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh b/cpp/include/cuvs/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh index 7d6f98d52..0978c4c7f 100644 --- a/cpp/include/cuvs/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh +++ b/cpp/include/cuvs/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh @@ -47,9 +47,9 @@ template __device__ inline void copy_vectorized(T* out, const T* in, uint32_t n) { constexpr int VecElems = VecBytes / sizeof(T); // NOLINT - using align_bytes = Pow2<(size_t)VecBytes>; + using align_bytes = raft::Pow2<(size_t)VecBytes>; if constexpr (VecElems > 1) { - using align_elems = Pow2; + using align_elems = raft::Pow2; if (!align_bytes::areSameAlignOffsets(out, in)) { return copy_vectorized<(VecBytes >> 1), T>(out, in, n); } @@ -63,7 +63,7 @@ __device__ inline void copy_vectorized(T* out, const T* in, uint32_t n) } } { // process main part vectorized - using vec_t = typename IOType::Type; + using vec_t = typename raft::IOType::Type; copy_vectorized( reinterpret_cast(out), reinterpret_cast(in), align_elems::div(n)); } @@ -87,7 +87,7 @@ __device__ inline void copy_vectorized(T* out, const T* in, uint32_t n) * between them, and aggregate it using the provided Lambda; one structure per thread, per query, * and per index item. * - * @tparam kUnroll elements per loop (normally, kUnroll = WarpSize / Veclen) + * @tparam kUnroll elements per loop (normally, kUnroll = raft::WarpSize / Veclen) * @tparam Lambda computing the part of the distance for one dimension and aggregating it: * void (AccT& acc, AccT x, AccT y) * @tparam Veclen size of the vectorized load @@ -119,9 +119,9 @@ struct loadAndComputeDist { #pragma unroll for (int j = 0; j < kUnroll; ++j) { T encV[Veclen]; - ldg(encV, data + (loadIndex + j * kIndexGroupSize) * Veclen); + raft::ldg(encV, data + (loadIndex + j * kIndexGroupSize) * Veclen); T queryRegs[Veclen]; - lds(queryRegs, &query_shared[shmemIndex + j * Veclen]); + raft::lds(queryRegs, &query_shared[shmemIndex + j * Veclen]); #pragma unroll for (int k = 0; k < Veclen; ++k) { compute_dist(dist, queryRegs[k], encV[k]); @@ -132,8 +132,8 @@ struct loadAndComputeDist { /** * Load parts of vectors from the index and query and accumulates the partial distance. * This version assumes the query is stored in the global memory and is different for every - * thread. One warp loads exactly WarpSize query elements at once and then reshuffles them into - * corresponding threads (`WarpSize / (kUnroll * Veclen)` elements per thread at once). + * thread. One warp loads exactly raft::WarpSize query elements at once and then reshuffles them + * into corresponding threads (`raft::WarpSize / (kUnroll * Veclen)` elements per thread at once). */ template __device__ __forceinline__ void runLoadShflAndCompute(const T*& data, @@ -143,18 +143,18 @@ struct loadAndComputeDist { { T queryReg = query[baseLoadIndex + lane_id]; constexpr int stride = kUnroll * Veclen; - constexpr int totalIter = WarpSize / stride; + constexpr int totalIter = raft::WarpSize / stride; constexpr int gmemStride = stride * kIndexGroupSize; #pragma unroll for (int i = 0; i < totalIter; ++i, data += gmemStride) { #pragma unroll for (int j = 0; j < kUnroll; ++j) { T encV[Veclen]; - ldg(encV, data + (lane_id + j * kIndexGroupSize) * Veclen); + raft::ldg(encV, data + (lane_id + j * kIndexGroupSize) * Veclen); const int d = (i * kUnroll + j) * Veclen; #pragma unroll for (int k = 0; k < Veclen; ++k) { - compute_dist(dist, shfl(queryReg, d + k, WarpSize), encV[k]); + compute_dist(dist, raft::shfl(queryReg, d + k, raft::WarpSize), encV[k]); } } } @@ -162,7 +162,7 @@ struct loadAndComputeDist { /** * Load parts of vectors from the index and query and accumulates the partial distance. - * This version augments `runLoadShflAndCompute` when `dim` is not a multiple of `WarpSize`. + * This version augments `runLoadShflAndCompute` when `dim` is not a multiple of `raft::WarpSize`. */ __device__ __forceinline__ void runLoadShflAndComputeRemainder( const T*& data, const T* query, const int lane_id, const int dim, const int dimBlocks) @@ -172,10 +172,10 @@ struct loadAndComputeDist { const int loadDataIdx = lane_id * Veclen; for (int d = 0; d < dim - dimBlocks; d += Veclen, data += kIndexGroupSize * Veclen) { T enc[Veclen]; - ldg(enc, data + loadDataIdx); + raft::ldg(enc, data + loadDataIdx); #pragma unroll for (int k = 0; k < Veclen; k++) { - compute_dist(dist, shfl(queryReg, d + k, WarpSize), enc[k]); + compute_dist(dist, raft::shfl(queryReg, d + k, raft::WarpSize), enc[k]); } } } @@ -202,8 +202,9 @@ struct loadAndComputeDist { #pragma unroll for (int j = 0; j < kUnroll; ++j) { uint32_t encV[veclen_int]; - ldg(encV, - reinterpret_cast(data) + loadIndex + j * kIndexGroupSize * veclen_int); + raft::ldg( + encV, + reinterpret_cast(data) + loadIndex + j * kIndexGroupSize * veclen_int); uint32_t queryRegs[veclen_int]; lds(queryRegs, reinterpret_cast(query_shared + shmemIndex) + j * veclen_int); #pragma unroll @@ -223,16 +224,17 @@ struct loadAndComputeDist { constexpr int stride = kUnroll * uint8_veclen; #pragma unroll - for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) { + for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { #pragma unroll for (int j = 0; j < kUnroll; ++j) { uint32_t encV[veclen_int]; - ldg(encV, - reinterpret_cast(data) + (lane_id + j * kIndexGroupSize) * veclen_int); + raft::ldg( + encV, + reinterpret_cast(data) + (lane_id + j * kIndexGroupSize) * veclen_int); const int d = (i * kUnroll + j) * veclen_int; #pragma unroll for (int k = 0; k < veclen_int; ++k) { - compute_dist(dist, shfl(queryReg, d + k, WarpSize), encV[k]); + compute_dist(dist, shfl(queryReg, d + k, raft::WarpSize), encV[k]); } } } @@ -250,10 +252,10 @@ struct loadAndComputeDist { for (int d = 0; d < dim - dimBlocks; d += uint8_veclen, data += kIndexGroupSize * uint8_veclen) { uint32_t enc[veclen_int]; - ldg(enc, reinterpret_cast(data) + lane_id * veclen_int); + raft::ldg(enc, reinterpret_cast(data) + lane_id * veclen_int); #pragma unroll for (int k = 0; k < veclen_int; k++) { - uint32_t q = shfl(queryReg, (d / 4) + k, WarpSize); + uint32_t q = shfl(queryReg, (d / 4) + k, raft::WarpSize); compute_dist(dist, q, enc[k]); } } @@ -295,11 +297,11 @@ struct loadAndComputeDist { constexpr int stride = kUnroll * veclen; #pragma unroll - for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) { + for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { #pragma unroll for (int j = 0; j < kUnroll; ++j) { uint32_t encV = reinterpret_cast(data)[lane_id + j * kIndexGroupSize]; - uint32_t q = shfl(queryReg, i * kUnroll + j, WarpSize); + uint32_t q = shfl(queryReg, i * kUnroll + j, raft::WarpSize); compute_dist(dist, q, encV); } } @@ -316,7 +318,7 @@ struct loadAndComputeDist { uint32_t queryReg = loadDim < dim ? reinterpret_cast(query)[loadDim] : 0; for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { uint32_t enc = reinterpret_cast(data)[lane_id]; - uint32_t q = shfl(queryReg, d / veclen, WarpSize); + uint32_t q = shfl(queryReg, d / veclen, raft::WarpSize); compute_dist(dist, q, enc); } } @@ -356,11 +358,11 @@ struct loadAndComputeDist { constexpr int stride = kUnroll * veclen; #pragma unroll - for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) { + for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { #pragma unroll for (int j = 0; j < kUnroll; ++j) { uint32_t encV = reinterpret_cast(data)[lane_id + j * kIndexGroupSize]; - uint32_t q = shfl(queryReg, i * kUnroll + j, WarpSize); + uint32_t q = shfl(queryReg, i * kUnroll + j, raft::WarpSize); compute_dist(dist, q, encV); } } @@ -377,7 +379,7 @@ struct loadAndComputeDist { uint32_t queryReg = loadDim < dim ? reinterpret_cast(query + loadDim)[0] : 0; for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { uint32_t enc = reinterpret_cast(data)[lane_id]; - uint32_t q = shfl(queryReg, d / veclen, WarpSize); + uint32_t q = shfl(queryReg, d / veclen, raft::WarpSize); compute_dist(dist, q, enc); } } @@ -416,11 +418,11 @@ struct loadAndComputeDist { constexpr int stride = kUnroll * veclen; #pragma unroll - for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) { + for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { #pragma unroll for (int j = 0; j < kUnroll; ++j) { uint32_t encV = data[lane_id + j * kIndexGroupSize]; - uint32_t q = shfl(queryReg, i * kUnroll + j, WarpSize); + uint32_t q = shfl(queryReg, i * kUnroll + j, raft::WarpSize); compute_dist(dist, q, encV); } } @@ -437,7 +439,7 @@ struct loadAndComputeDist { uint32_t queryReg = loadDim < dim ? query[loadDim] : 0; for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { uint32_t enc = data[lane_id]; - uint32_t q = shfl(queryReg, d, WarpSize); + uint32_t q = shfl(queryReg, d, raft::WarpSize); compute_dist(dist, q, enc); } } @@ -464,10 +466,12 @@ struct loadAndComputeDist { #pragma unroll for (int j = 0; j < kUnroll; ++j) { int32_t encV[veclen_int]; - ldg(encV, - reinterpret_cast(data) + (loadIndex + j * kIndexGroupSize) * veclen_int); + raft::ldg( + encV, + reinterpret_cast(data) + (loadIndex + j * kIndexGroupSize) * veclen_int); int32_t queryRegs[veclen_int]; - lds(queryRegs, reinterpret_cast(query_shared + shmemIndex) + j * veclen_int); + raft::lds(queryRegs, + reinterpret_cast(query_shared + shmemIndex) + j * veclen_int); #pragma unroll for (int k = 0; k < veclen_int; k++) { compute_dist(dist, queryRegs[k], encV[k]); @@ -487,16 +491,17 @@ struct loadAndComputeDist { constexpr int stride = kUnroll * int8_veclen; #pragma unroll - for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) { + for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { #pragma unroll for (int j = 0; j < kUnroll; ++j) { int32_t encV[veclen_int]; - ldg(encV, - reinterpret_cast(data) + (lane_id + j * kIndexGroupSize) * veclen_int); + raft::ldg( + encV, + reinterpret_cast(data) + (lane_id + j * kIndexGroupSize) * veclen_int); const int d = (i * kUnroll + j) * veclen_int; #pragma unroll for (int k = 0; k < veclen_int; ++k) { - int32_t q = shfl(queryReg, d + k, WarpSize); + int32_t q = raft::shfl(queryReg, d + k, raft::WarpSize); compute_dist(dist, q, encV[k]); } } @@ -511,10 +516,10 @@ struct loadAndComputeDist { int32_t queryReg = loadDim < dim ? reinterpret_cast(query + loadDim)[0] : 0; for (int d = 0; d < dim - dimBlocks; d += int8_veclen, data += kIndexGroupSize * int8_veclen) { int32_t enc[veclen_int]; - ldg(enc, reinterpret_cast(data) + lane_id * veclen_int); + raft::ldg(enc, reinterpret_cast(data) + lane_id * veclen_int); #pragma unroll for (int k = 0; k < veclen_int; k++) { - int32_t q = shfl(queryReg, (d / 4) + k, WarpSize); // Here 4 is for 1 - int; + int32_t q = shfl(queryReg, (d / 4) + k, raft::WarpSize); // Here 4 is for 1 - int; compute_dist(dist, q, enc[k]); } } @@ -553,11 +558,11 @@ struct loadAndComputeDist { constexpr int stride = kUnroll * veclen; #pragma unroll - for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) { + for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { #pragma unroll for (int j = 0; j < kUnroll; ++j) { int32_t encV = reinterpret_cast(data)[lane_id + j * kIndexGroupSize]; - int32_t q = shfl(queryReg, i * kUnroll + j, WarpSize); + int32_t q = shfl(queryReg, i * kUnroll + j, raft::WarpSize); compute_dist(dist, q, encV); } } @@ -571,7 +576,7 @@ struct loadAndComputeDist { int32_t queryReg = loadDim < dim ? reinterpret_cast(query + loadDim)[0] : 0; for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { int32_t enc = reinterpret_cast(data + lane_id * veclen)[0]; - int32_t q = shfl(queryReg, d / veclen, WarpSize); + int32_t q = shfl(queryReg, d / veclen, raft::WarpSize); compute_dist(dist, q, enc); } } @@ -607,11 +612,12 @@ struct loadAndComputeDist { int32_t queryReg = query[baseLoadIndex + lane_id]; #pragma unroll - for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) { + for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { #pragma unroll for (int j = 0; j < kUnroll; ++j) { - compute_dist( - dist, shfl(queryReg, i * kUnroll + j, WarpSize), data[lane_id + j * kIndexGroupSize]); + compute_dist(dist, + raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize), + data[lane_id + j * kIndexGroupSize]); } } } @@ -622,7 +628,7 @@ struct loadAndComputeDist { const int loadDim = dimBlocks + lane_id; int32_t queryReg = loadDim < dim ? query[loadDim] : 0; for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { - compute_dist(dist, shfl(queryReg, d, WarpSize), data[lane_id]); + compute_dist(dist, raft::shfl(queryReg, d, raft::WarpSize), data[lane_id]); } } }; @@ -637,7 +643,7 @@ struct loadAndComputeDist { * * @param compute_dist distance function * @param query_smem_elems number of dimensions of the query vector to fit in a shared memory of a - * block; this number must be a multiple of `WarpSize * Veclen`. + * block; this number must be a multiple of `raft::WarpSize * Veclen`. * @param[in] query a pointer to all queries in a row-major contiguous format [gridDim.y, dim] * @param[in] coarse_index a pointer to the cluster indices to search through [n_probes] * @param[in] list_indices index.indices @@ -705,7 +711,7 @@ RAFT_KERNEL __launch_bounds__(kThreadsPerBlock) block_sort_t queue(k); { - using align_warp = Pow2; + using align_warp = Pow2; const int lane_id = align_warp::mod(threadIdx.x); // How many full warps needed to compute the distance (without remainder) @@ -725,9 +731,9 @@ RAFT_KERNEL __launch_bounds__(kThreadsPerBlock) const uint32_t num_groups = align_warp::div(list_length + align_warp::Mask); // raft::ceildiv by power of 2 - constexpr int kUnroll = WarpSize / Veclen; - constexpr uint32_t kNumWarps = kThreadsPerBlock / WarpSize; - // Every warp reads WarpSize vectors and computes the distances to them. + constexpr int kUnroll = raft::WarpSize / Veclen; + constexpr uint32_t kNumWarps = kThreadsPerBlock / raft::WarpSize; + // Every warp reads raft::WarpSize vectors and computes the distances to them. // Then, the distances and corresponding ids are distributed among the threads, // and each thread adds one (id, dist) pair to the filtering queue. for (uint32_t group_id = align_warp::div(threadIdx.x); group_id < num_groups; @@ -737,7 +743,7 @@ RAFT_KERNEL __launch_bounds__(kThreadsPerBlock) const T* data = list_data_ptrs[list_id] + (group_id * kIndexGroupSize) * dim; // This is the vector a given lane/thread handles - const uint32_t vec_id = group_id * WarpSize + lane_id; + const uint32_t vec_id = group_id * raft::WarpSize + lane_id; const bool valid = vec_id < list_length && sample_filter(queries_offset + blockIdx.y, list_id, vec_id); @@ -746,7 +752,7 @@ RAFT_KERNEL __launch_bounds__(kThreadsPerBlock) loadAndComputeDist lc(dist, compute_dist); for (int pos = 0; pos < shm_assisted_dim; - pos += WarpSize, data += kIndexGroupSize * WarpSize) { + pos += raft::WarpSize, data += kIndexGroupSize * raft::WarpSize) { lc.runLoadShmemCompute(data, query_shared, lane_id, pos); } } @@ -755,7 +761,7 @@ RAFT_KERNEL __launch_bounds__(kThreadsPerBlock) // The default path - using shfl ops - for dimensions beyond query_smem_elems loadAndComputeDist lc(dist, compute_dist); - for (int pos = shm_assisted_dim; pos < full_warps_along_dim; pos += WarpSize) { + for (int pos = shm_assisted_dim; pos < full_warps_along_dim; pos += raft::WarpSize) { lc.runLoadShflAndCompute(data, query, pos, lane_id); } lc.runLoadShflAndComputeRemainder(data, query, lane_id, dim, full_warps_along_dim); @@ -840,9 +846,9 @@ void launch_kernel(Lambda lambda, PostLambda>; const int max_query_smem = 16384; int query_smem_elems = - std::min(max_query_smem / sizeof(T), Pow2::roundUp(index.dim())); + std::min(max_query_smem / sizeof(T), Pow2::roundUp(index.dim())); int smem_size = query_smem_elems * sizeof(T); - constexpr int kSubwarpSize = std::min(Capacity, WarpSize); + constexpr int kSubwarpSize = std::min(Capacity, raft::WarpSize); auto block_merge_mem = raft::matrix::detail::select::warpsort::calc_smem_size_for_block_wide( kThreadsPerBlock / kSubwarpSize, k); @@ -936,7 +942,7 @@ struct inner_prod_dist { __device__ __forceinline__ void operator()(AccT& acc, AccT x, AccT y) { if constexpr (Veclen > 1 && (std::is_same_v || std::is_same_v)) { - acc = dp4a(x, y, acc); + acc = raft::dp4a(x, y, acc); } else { acc += x * y; } @@ -1000,7 +1006,7 @@ template (1, 16 / sizeof(T))> struct select_interleaved_scan_kernel { /** @@ -1097,7 +1103,7 @@ void ivfflat_interleaved_scan(const index& index, uint32_t& grid_dim_x, rmm::cuda_stream_view stream) { - const int capacity = bound_by_power_of_two(k); + const int capacity = raft::bound_by_power_of_two(k); auto filter_adapter = cuvs::neighbors::filtering::ivf_to_sample_filter( index.inds_ptrs().data_handle(), sample_filter); diff --git a/cpp/include/cuvs/neighbors/detail/knn_brute_force.cuh b/cpp/include/cuvs/neighbors/detail/knn_brute_force.cuh index 2b75eb048..6914ea030 100644 --- a/cpp/include/cuvs/neighbors/detail/knn_brute_force.cuh +++ b/cpp/include/cuvs/neighbors/detail/knn_brute_force.cuh @@ -76,8 +76,8 @@ void tiled_brute_force_knn(const raft::resources& handle, // Figure out the number of rows/cols to tile for size_t tile_rows = 0; size_t tile_cols = 0; - auto stream = resource::get_cuda_stream(handle); - auto device_memory = resource::get_workspace_resource(handle); + auto stream = raft::resource::get_cuda_stream(handle); + auto device_memory = raft::resource::get_workspace_resource(handle); auto total_mem = device_memory->get_mem_info(stream).second; faiss_select::chooseTileSize(m, n, d, sizeof(ElementType), total_mem, tile_rows, tile_cols); @@ -236,7 +236,7 @@ void tiled_brute_force_knn(const raft::resources& handle, } } - matrix::select_k( + raft::matrix::select_k( handle, raft::make_device_matrix_view( temp_distances.data(), current_query_size, current_centroid_size), @@ -263,7 +263,7 @@ void tiled_brute_force_knn(const raft::resources& handle, IndexType* out_indices = temp_out_indices.data(); auto count = thrust::make_counting_iterator(0); - thrust::for_each(resource::get_thrust_policy(handle), + thrust::for_each(raft::resource::get_thrust_policy(handle), count, count + current_query_size * current_k, [=] __device__(IndexType i) { @@ -278,7 +278,7 @@ void tiled_brute_force_knn(const raft::resources& handle, if (tile_cols != n) { // select the actual top-k items here from the temporary output - matrix::select_k( + raft::matrix::select_k( handle, raft::make_device_matrix_view( temp_out_distances.data(), current_query_size, temp_out_cols), diff --git a/cpp/include/cuvs/neighbors/detail/knn_brute_force_batch_k_query.cuh b/cpp/include/cuvs/neighbors/detail/knn_brute_force_batch_k_query.cuh index f4961ff3a..8d6dce407 100644 --- a/cpp/include/cuvs/neighbors/detail/knn_brute_force_batch_k_query.cuh +++ b/cpp/include/cuvs/neighbors/detail/knn_brute_force_batch_k_query.cuh @@ -86,9 +86,9 @@ class gpu_batch_k_query : public batch_k_query { if (!num_queries || !batch_size) { return; } - matrix::slice_coordinates coords{0, offset, num_queries, offset + batch_size}; - matrix::slice(this->res, input.indices(), output->indices(), coords); - matrix::slice(this->res, input.distances(), output->distances(), coords); + raft::matrix::slice_coordinates coords{0, offset, num_queries, offset + batch_size}; + raft::matrix::slice(this->res, input.indices(), output->indices(), coords); + raft::matrix::slice(this->res, input.distances(), output->distances(), coords); } const cuvs::neighbors::brute_force::index& index; diff --git a/cpp/include/cuvs/neighbors/detail/knn_merge_parts.cuh b/cpp/include/cuvs/neighbors/detail/knn_merge_parts.cuh index 54b97f562..555f7b258 100644 --- a/cpp/include/cuvs/neighbors/detail/knn_merge_parts.cuh +++ b/cpp/include/cuvs/neighbors/detail/knn_merge_parts.cuh @@ -68,7 +68,7 @@ RAFT_KERNEL knn_merge_parts_kernel(const value_t* inK, const value_t* inKStart = inK + (row_idx + col); const value_idx* inVStart = inV + (row_idx + col); - int limit = Pow2::roundDown(total_k); + int limit = raft::Pow2::roundDown(total_k); value_idx translation = 0; for (; i < limit; i += tpb) { diff --git a/cpp/include/cuvs/neighbors/ivf_flat_types.hpp b/cpp/include/cuvs/neighbors/ivf_flat_types.hpp index e9c452f5c..4863805fa 100644 --- a/cpp/include/cuvs/neighbors/ivf_flat_types.hpp +++ b/cpp/include/cuvs/neighbors/ivf_flat_types.hpp @@ -204,7 +204,7 @@ struct index : ann::index { * NB: this may be empty if the index is empty or if the metric does not require the center norms * calculation. */ - inline auto center_norms() noexcept -> std::optional> + inline auto center_norms() noexcept -> std::optional> { if (center_norms_.has_value()) { return std::make_optional>(center_norms_->view()); @@ -213,10 +213,11 @@ struct index : ann::index { } } [[nodiscard]] inline auto center_norms() const noexcept - -> std::optional> + -> std::optional> { if (center_norms_.has_value()) { - return std::make_optional>(center_norms_->view()); + return std::make_optional>( + center_norms_->view()); } else { return std::nullopt; } @@ -251,12 +252,12 @@ struct index : ann::index { metric_(metric), adaptive_centers_(adaptive_centers), conservative_memory_allocation_{conservative_memory_allocation}, - centers_(make_device_matrix(res, n_lists, dim)), + centers_(raft::make_device_matrix(res, n_lists, dim)), center_norms_(std::nullopt), lists_{n_lists}, - list_sizes_{make_device_vector(res, n_lists)}, - data_ptrs_{make_device_vector(res, n_lists)}, - inds_ptrs_{make_device_vector(res, n_lists)}, + list_sizes_{raft::make_device_vector(res, n_lists)}, + data_ptrs_{raft::make_device_vector(res, n_lists)}, + inds_ptrs_{raft::make_device_vector(res, n_lists)}, total_size_{0} { check_consistency(); @@ -366,7 +367,7 @@ struct index : ann::index { std::vector>> lists_; raft::device_vector list_sizes_; raft::device_matrix centers_; - std::optional> center_norms_; + std::optional> center_norms_; // Computed members raft::device_vector data_ptrs_; diff --git a/cpp/include/cuvs/neighbors/ivf_list_types.hpp b/cpp/include/cuvs/neighbors/ivf_list_types.hpp index 67dfba807..8d57971a2 100644 --- a/cpp/include/cuvs/neighbors/ivf_list_types.hpp +++ b/cpp/include/cuvs/neighbors/ivf_list_types.hpp @@ -46,9 +46,9 @@ struct list { using list_extents = typename spec_type::list_extents; /** Possibly encoded data; it's layout is defined by `SpecT`. */ - device_mdarray data; + raft::device_mdarray data; /** Source indices. */ - device_mdarray, raft::row_major> indices; + raft::device_mdarray, raft::row_major> indices; /** The actual size of the content. */ std::atomic size; diff --git a/cpp/include/cuvs/spatial/knn/detail/haversine_distance.cuh b/cpp/include/cuvs/spatial/knn/detail/haversine_distance.cuh index 44fa210b8..992d27912 100644 --- a/cpp/include/cuvs/spatial/knn/detail/haversine_distance.cuh +++ b/cpp/include/cuvs/spatial/knn/detail/haversine_distance.cuh @@ -70,7 +70,7 @@ RAFT_KERNEL haversine_knn_kernel(value_idx* out_inds, std::numeric_limits::max(), std::numeric_limits::max(), smemK, smemV, k); // Grid is exactly sized to rows available - int limit = Pow2::roundDown(n_index_rows); + int limit = raft::Pow2::roundDown(n_index_rows); const value_t* query_ptr = query + (blockIdx.x * 2); value_t x1 = query_ptr[0];