diff --git a/cpp/include/cuvs/cluster/detail/kmeans.cuh b/cpp/include/cuvs/cluster/detail/kmeans.cuh
index b192973ad..1ed9f4ccd 100644
--- a/cpp/include/cuvs/cluster/detail/kmeans.cuh
+++ b/cpp/include/cuvs/cluster/detail/kmeans.cuh
@@ -66,7 +66,7 @@ void initRandom(raft::resources const& handle,
                 raft::device_matrix_view<const DataT, IndexT> X,
                 raft::device_matrix_view<DataT, IndexT> centroids)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("initRandom");
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("initRandom");
   cudaStream_t stream = resource::get_cuda_stream(handle);
   auto n_clusters     = params.n_clusters;
   detail::shuffleAndGather<DataT, IndexT>(handle, X, centroids, n_clusters, params.rng_state.seed);
@@ -93,7 +93,7 @@ void kmeansPlusPlus(raft::resources const& handle,
                     raft::device_matrix_view<DataT, IndexT> centroidsRawData,
                     rmm::device_uvector<char>& workspace)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("kmeansPlusPlus");
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeansPlusPlus");
   cudaStream_t stream = resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
   auto n_features     = X.extent(1);
@@ -367,7 +367,7 @@ void kmeans_fit_main(raft::resources const& handle,
                      raft::host_scalar_view<IndexT> n_iter,
                      rmm::device_uvector<char>& workspace)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("kmeans_fit_main");
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeans_fit_main");
   logger::get(RAFT_NAME).set_level(params.verbosity);
   cudaStream_t stream = resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
@@ -524,7 +524,7 @@ void kmeans_fit_main(raft::resources const& handle,
                                                       workspace);
 
   // TODO: add different templates for InType of binaryOp to avoid thrust transform
-  thrust::transform(resource::get_thrust_policy(handle),
+  thrust::transform(raft::resource::get_thrust_policy(handle),
                     minClusterAndDistance.data_handle(),
                     minClusterAndDistance.data_handle() + minClusterAndDistance.size(),
                     weight.data_handle(),
@@ -581,7 +581,8 @@ void initScalableKMeansPlusPlus(raft::resources const& handle,
                                 raft::device_matrix_view<DataT, IndexT> centroidsRawData,
                                 rmm::device_uvector<char>& workspace)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("initScalableKMeansPlusPlus");
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
+    "initScalableKMeansPlusPlus");
   cudaStream_t stream = resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
   auto n_features     = X.extent(1);
@@ -826,7 +827,7 @@ void kmeans_fit(raft::resources const& handle,
                 raft::host_scalar_view<DataT> inertia,
                 raft::host_scalar_view<IndexT> n_iter)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("kmeans_fit");
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeans_fit");
   auto n_samples      = X.extent(0);
   auto n_features     = X.extent(1);
   auto n_clusters     = params.n_clusters;
@@ -872,7 +873,7 @@ void kmeans_fit(raft::resources const& handle,
   if (sample_weight.has_value())
     raft::copy(weight.data_handle(), sample_weight.value().data_handle(), n_samples, stream);
   else
-    thrust::fill(resource::get_thrust_policy(handle),
+    thrust::fill(raft::resource::get_thrust_policy(handle),
                  weight.data_handle(),
                  weight.data_handle() + weight.size(),
                  1);
@@ -993,7 +994,7 @@ void kmeans_predict(raft::resources const& handle,
                     bool normalize_weight,
                     raft::host_scalar_view<DataT> inertia)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("kmeans_predict");
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeans_predict");
   auto n_samples      = X.extent(0);
   auto n_features     = X.extent(1);
   cudaStream_t stream = resource::get_cuda_stream(handle);
@@ -1019,7 +1020,7 @@ void kmeans_predict(raft::resources const& handle,
   if (sample_weight.has_value())
     raft::copy(weight.data_handle(), sample_weight.value().data_handle(), n_samples, stream);
   else
-    thrust::fill(resource::get_thrust_policy(handle),
+    thrust::fill(raft::resource::get_thrust_policy(handle),
                  weight.data_handle(),
                  weight.data_handle() + weight.size(),
                  1);
@@ -1065,7 +1066,7 @@ void kmeans_predict(raft::resources const& handle,
   // calculate cluster cost phi_x(C)
   rmm::device_scalar<DataT> clusterCostD(stream);
   // TODO: add different templates for InType of binaryOp to avoid thrust transform
-  thrust::transform(resource::get_thrust_policy(handle),
+  thrust::transform(raft::resource::get_thrust_policy(handle),
                     minClusterAndDistance.data_handle(),
                     minClusterAndDistance.data_handle() + minClusterAndDistance.size(),
                     weight.data_handle(),
@@ -1084,7 +1085,7 @@ void kmeans_predict(raft::resources const& handle,
                              raft::value_op{},
                              raft::add_op{});
 
-  thrust::transform(resource::get_thrust_policy(handle),
+  thrust::transform(raft::resource::get_thrust_policy(handle),
                     minClusterAndDistance.data_handle(),
                     minClusterAndDistance.data_handle() + minClusterAndDistance.size(),
                     labels.data_handle(),
@@ -1135,7 +1136,7 @@ void kmeans_fit_predict(raft::resources const& handle,
                         raft::host_scalar_view<DataT> inertia,
                         raft::host_scalar_view<IndexT> n_iter)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("kmeans_fit_predict");
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeans_fit_predict");
   if (!centroids.has_value()) {
     auto n_features = X.extent(1);
     auto centroids_matrix =
@@ -1199,7 +1200,7 @@ void kmeans_transform(raft::resources const& handle,
                       raft::device_matrix_view<const DataT> centroids,
                       raft::device_matrix_view<DataT> X_new)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("kmeans_transform");
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeans_transform");
   logger::get(RAFT_NAME).set_level(params.verbosity);
   cudaStream_t stream = resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
diff --git a/cpp/include/cuvs/cluster/detail/kmeans_balanced.cuh b/cpp/include/cuvs/cluster/detail/kmeans_balanced.cuh
index b774a1b8f..1b946cc1e 100644
--- a/cpp/include/cuvs/cluster/detail/kmeans_balanced.cuh
+++ b/cpp/include/cuvs/cluster/detail/kmeans_balanced.cuh
@@ -104,7 +104,7 @@ inline std::enable_if_t<std::is_floating_point_v<MathT>> predict_core(
       auto minClusterAndDistance = raft::make_device_mdarray<raft::KeyValuePair<IdxT, MathT>, IdxT>(
         handle, mr, make_extents<IdxT>(n_rows));
       raft::KeyValuePair<IdxT, MathT> initial_value(0, std::numeric_limits<MathT>::max());
-      thrust::fill(resource::get_thrust_policy(handle),
+      thrust::fill(raft::resource::get_thrust_policy(handle),
                    minClusterAndDistance.data_handle(),
                    minClusterAndDistance.data_handle() + minClusterAndDistance.size(),
                    initial_value);
@@ -130,7 +130,7 @@ inline std::enable_if_t<std::is_floating_point_v<MathT>> predict_core(
 
       // todo(lsugy): use KVP + iterator in caller.
       // Copy keys to output labels
-      thrust::transform(resource::get_thrust_policy(handle),
+      thrust::transform(raft::resource::get_thrust_policy(handle),
                         minClusterAndDistance.data_handle(),
                         minClusterAndDistance.data_handle() + n_rows,
                         labels,
@@ -325,7 +325,7 @@ void compute_norm(const raft::resources& handle,
                   MappingOpT mapping_op,
                   rmm::mr::device_memory_resource* mr = nullptr)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("compute_norm");
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("compute_norm");
   auto stream = resource::get_cuda_stream(handle);
   if (mr == nullptr) { mr = resource::get_workspace_resource(handle); }
   rmm::device_uvector<MathT> mapped_dataset(0, stream, mr);
@@ -381,7 +381,7 @@ void predict(const raft::resources& handle,
              const MathT* dataset_norm           = nullptr)
 {
   auto stream = resource::get_cuda_stream(handle);
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
     "predict(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
   if (mr == nullptr) { mr = resource::get_workspace_resource(handle); }
   auto [max_minibatch_size, _mem_per_row] =
@@ -473,7 +473,7 @@ __launch_bounds__((WarpSize * BlockDimY)) RAFT_KERNEL
   const MathT wc = min(static_cast<MathT>(csize), static_cast<MathT>(kAdjustCentersWeight));
   // Weight for the datapoint used to shift the center.
   const MathT wd = 1.0;
-  for (; j < dim; j += WarpSize) {
+  for (; j < dim; j += raft::WarpSize) {
     MathT val = 0;
     val += wc * centers[j + dim * li];
     val += wd * mapping_op(dataset[j + dim * i]);
@@ -533,7 +533,7 @@ auto adjust_centers(MathT* centers,
                     rmm::cuda_stream_view stream,
                     rmm::mr::device_memory_resource* device_memory) -> bool
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
     "adjust_centers(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
   if (n_clusters == 0) { return false; }
   constexpr static std::array kPrimes{29,   71,   113,  173,  229,  281,  349,  409,  463,  541,
@@ -901,7 +901,7 @@ auto build_fine_clusters(const raft::resources& handle,
     raft::matrix::gather(mapping_itr, dim, n_rows, mc_trainset_ids, k, mc_trainset, stream);
     if (params.metric == cuvs::distance::DistanceType::L2Expanded ||
         params.metric == cuvs::distance::DistanceType::L2SqrtExpanded) {
-      thrust::gather(resource::get_thrust_policy(handle),
+      thrust::gather(raft::resource::get_thrust_policy(handle),
                      mc_trainset_ids,
                      mc_trainset_ids + k,
                      dataset_norm_mptr,
@@ -964,7 +964,7 @@ void build_hierarchical(const raft::resources& handle,
   auto stream  = resource::get_cuda_stream(handle);
   using LabelT = uint32_t;
 
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
     "build_hierarchical(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
 
   IdxT n_mesoclusters = std::min(n_clusters, static_cast<IdxT>(std::sqrt(n_clusters) + 0.5));
diff --git a/cpp/include/cuvs/cluster/detail/kmeans_common.cuh b/cpp/include/cuvs/cluster/detail/kmeans_common.cuh
index 965d8a549..d4f6a43a2 100644
--- a/cpp/include/cuvs/cluster/detail/kmeans_common.cuh
+++ b/cpp/include/cuvs/cluster/detail/kmeans_common.cuh
@@ -267,7 +267,7 @@ void sampleCentroids(raft::resources const& handle,
   resource::sync_stream(handle, stream);
 
   uint8_t* rawPtr_isSampleCentroid = isSampleCentroid.data_handle();
-  thrust::for_each_n(resource::get_thrust_policy(handle),
+  thrust::for_each_n(raft::resource::get_thrust_policy(handle),
                      sampledMinClusterDistance.data_handle(),
                      nPtsSampledInRank,
                      [=] __device__(raft::KeyValuePair<ptrdiff_t, DataT> val) {
@@ -399,7 +399,7 @@ void minClusterAndDistanceCompute(
 
   raft::KeyValuePair<IndexT, DataT> initial_value(0, std::numeric_limits<DataT>::max());
 
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                minClusterAndDistance.data_handle(),
                minClusterAndDistance.data_handle() + minClusterAndDistance.size(),
                initial_value);
@@ -527,7 +527,7 @@ void minClusterDistanceCompute(raft::resources const& handle,
   auto pairwiseDistance = raft::make_device_matrix_view<DataT, IndexT>(
     L2NormBuf_OR_DistBuf.data(), dataBatchSize, centroidsBatchSize);
 
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                minClusterDistance.data_handle(),
                minClusterDistance.data_handle() + minClusterDistance.size(),
                std::numeric_limits<DataT>::max());
diff --git a/cpp/include/cuvs/distance/detail/fused_l2_nn.cuh b/cpp/include/cuvs/distance/detail/fused_l2_nn.cuh
index f5c920e08..0c2548863 100644
--- a/cpp/include/cuvs/distance/detail/fused_l2_nn.cuh
+++ b/cpp/include/cuvs/distance/detail/fused_l2_nn.cuh
@@ -101,7 +101,7 @@ void initialize(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp, cudaStream_t s
 }
 
 // TODO: specialize this function for MinAndDistanceReduceOp<int, float>
-// with atomicCAS of 64 bit which will eliminate mutex and shfls
+// with atomicCAS of 64 bit which will eliminate mutex and shfls
 template <typename P, typename OutT, typename IdxT, typename KVPair, typename ReduceOpT>
 DI void updateReducedVal(
   int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, IdxT m, IdxT gridStrideY)
@@ -204,7 +204,7 @@ __launch_bounds__(P::Nthreads, 2) RAFT_KERNEL fusedL2NNkernel(OutT* min,
 #pragma unroll
         for (int j = P::AccThCols / 2; j > 0; j >>= 1) {
           // Actually, the srcLane (lid +j) should be (lid +j) % P:AccThCols,
-          // but the shfl op applies the modulo internally.
+          // but the raft::shfl op applies the modulo internally.
           auto tmpkey   = raft::shfl(val[i].key, lid + j, P::AccThCols);
           auto tmpvalue = raft::shfl(val[i].value, lid + j, P::AccThCols);
           KVPair tmp    = {tmpkey, tmpvalue};
diff --git a/cpp/include/cuvs/distance/detail/masked_nn.cuh b/cpp/include/cuvs/distance/detail/masked_nn.cuh
index b5a65407b..8b30d8eec 100644
--- a/cpp/include/cuvs/distance/detail/masked_nn.cuh
+++ b/cpp/include/cuvs/distance/detail/masked_nn.cuh
@@ -255,7 +255,7 @@ void masked_l2_nn_impl(raft::resources const& handle,
 
   // Get stream and workspace memory resource
   rmm::mr::device_memory_resource* ws_mr =
-    dynamic_cast<rmm::mr::device_memory_resource*>(resource::get_workspace_resource(handle));
+    dynamic_cast<rmm::mr::device_memory_resource*>(raft::resource::get_workspace_resource(handle));
   auto stream = resource::get_cuda_stream(handle);
 
   // Acquire temporary buffers and initialize to zero:
diff --git a/cpp/include/cuvs/distance/detail/pairwise_distance_base.cuh b/cpp/include/cuvs/distance/detail/pairwise_distance_base.cuh
index 990f845fd..57366dec9 100644
--- a/cpp/include/cuvs/distance/detail/pairwise_distance_base.cuh
+++ b/cpp/include/cuvs/distance/detail/pairwise_distance_base.cuh
@@ -222,7 +222,7 @@ struct PairwiseDistances : public BaseClass {
 
   DI void accumulate()
   {
-    // We have a separate ldsXY and accumulate_reg_tile outside the loop body,
+    // We have a separate ldsXY and accumulate_reg_tile outside the loop body,
     // so that these separated calls can be interspersed with preceding and
     // following instructions, thereby hiding latency.
     this->ldsXY(0);
diff --git a/cpp/include/cuvs/distance/distance-ext.cuh b/cpp/include/cuvs/distance/distance-ext.cuh
index efba6eecf..fdbe6a971 100644
--- a/cpp/include/cuvs/distance/distance-ext.cuh
+++ b/cpp/include/cuvs/distance/distance-ext.cuh
@@ -132,7 +132,7 @@ void distance(raft::resources const& handle,
               raft::device_matrix_view<OutT, IdxT, layout> dist,
               DataT metric_arg = 2.0f) RAFT_EXPLICIT;
 
-template <typename Type, typename layout = layout_c_contiguous, typename IdxT = int>
+template <typename Type, typename layout = raft::layout_c_contiguous, typename IdxT = int>
 void pairwise_distance(raft::resources const& handle,
                        device_matrix_view<Type, IdxT, layout> const x,
                        device_matrix_view<Type, IdxT, layout> const y,
diff --git a/cpp/include/cuvs/neighbors/brute_force-inl.cuh b/cpp/include/cuvs/neighbors/brute_force-inl.cuh
index b5584c5b7..3d5c449a9 100644
--- a/cpp/include/cuvs/neighbors/brute_force-inl.cuh
+++ b/cpp/include/cuvs/neighbors/brute_force-inl.cuh
@@ -172,8 +172,8 @@ void knn(raft::resources const& handle,
   RAFT_EXPECTS(indices.extent(1) == distances.extent(1) && distances.extent(1),
                "Number of columns in output indices and distances matrices must the same");
 
-  bool rowMajorIndex = std::is_same_v<index_layout, layout_c_contiguous>;
-  bool rowMajorQuery = std::is_same_v<search_layout, layout_c_contiguous>;
+  bool rowMajorIndex = std::is_same_v<index_layout, raft::layout_c_contiguous>;
+  bool rowMajorQuery = std::is_same_v<search_layout, raft::layout_c_contiguous>;
 
   std::vector<value_t*> inputs;
   std::vector<matrix_idx> sizes;
diff --git a/cpp/include/cuvs/neighbors/cagra_types.hpp b/cpp/include/cuvs/neighbors/cagra_types.hpp
index 9dee5b72a..0299b78df 100644
--- a/cpp/include/cuvs/neighbors/cagra_types.hpp
+++ b/cpp/include/cuvs/neighbors/cagra_types.hpp
@@ -221,7 +221,7 @@ struct index : ann::index {
    * @endcode
    *   In the above example, we have passed a host dataset to build. The returned index will own a
    * device copy of the dataset and the knn_graph. In contrast, if we pass the dataset as a
-   * device_mdspan to build, then it will only store a reference to it.
+   * raft::device_mdspan to build, then it will only store a reference to it.
    *
    * - Constructing index using existing knn-graph
    * @code{.cpp}
diff --git a/cpp/include/cuvs/neighbors/detail/cagra/cagra_build.cuh b/cpp/include/cuvs/neighbors/detail/cagra/cagra_build.cuh
index 2349d5943..399d0071b 100644
--- a/cpp/include/cuvs/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/include/cuvs/neighbors/detail/cagra/cagra_build.cuh
@@ -54,10 +54,11 @@ void build_knn_graph(
                "Currently only L2Expanded metric is supported");
 
   uint32_t node_degree = knn_graph.extent(1);
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("cagra::build_graph(%zu, %zu, %u)",
-                                                            size_t(dataset.extent(0)),
-                                                            size_t(dataset.extent(1)),
-                                                            node_degree);
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
+    "cagra::build_graph(%zu, %zu, %u)",
+    size_t(dataset.extent(0)),
+    size_t(dataset.extent(1)),
+    node_degree);
 
   if (!build_params) {
     build_params          = ivf_pq::index_params{};
diff --git a/cpp/include/cuvs/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/cuvs/neighbors/detail/cagra/cagra_search.cuh
index 6680a8b4b..87d8876e3 100644
--- a/cpp/include/cuvs/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/include/cuvs/neighbors/detail/cagra/cagra_search.cuh
@@ -122,7 +122,7 @@ void search_main(raft::resources const& res,
 
   if (params.max_queries == 0) { params.max_queries = queries.extent(0); }
 
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
     "cagra::search(max_queries = %u, k = %u, dim = %zu)", params.max_queries, topk, index.dim());
 
   using CagraSampleFilterT_s = typename CagraSampleFilterT_Selector<CagraSampleFilterT>::type;
diff --git a/cpp/include/cuvs/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/include/cuvs/neighbors/detail/cagra/cagra_serialize.cuh
index c57f03bf4..019da84f3 100644
--- a/cpp/include/cuvs/neighbors/detail/cagra/cagra_serialize.cuh
+++ b/cpp/include/cuvs/neighbors/detail/cagra/cagra_serialize.cuh
@@ -49,7 +49,7 @@ void serialize(raft::resources const& res,
                const index<T, IdxT>& index_,
                bool include_dataset)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("cagra::serialize");
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("cagra::serialize");
 
   RAFT_LOG_DEBUG(
     "Saving CAGRA index, size %zu, dim %u", static_cast<size_t>(index_.size()), index_.dim());
@@ -103,7 +103,8 @@ void serialize_to_hnswlib(raft::resources const& res,
                           std::ostream& os,
                           const index<T, IdxT>& index_)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("cagra::serialize_to_hnswlib");
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
+    "cagra::serialize_to_hnswlib");
   RAFT_LOG_DEBUG("Saving CAGRA index to hnswlib format, size %zu, dim %u",
                  static_cast<size_t>(index_.size()),
                  index_.dim());
@@ -233,7 +234,7 @@ void serialize_to_hnswlib(raft::resources const& res,
 template <typename T, typename IdxT>
 auto deserialize(raft::resources const& res, std::istream& is) -> index<T, IdxT>
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("cagra::deserialize");
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("cagra::deserialize");
 
   char dtype_string[4];
   is.read(dtype_string, 4);
diff --git a/cpp/include/cuvs/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/include/cuvs/neighbors/detail/cagra/search_multi_kernel.cuh
index c1d550382..622a6a825 100644
--- a/cpp/include/cuvs/neighbors/detail/cagra/search_multi_kernel.cuh
+++ b/cpp/include/cuvs/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -204,8 +204,8 @@ void random_pickup(const DATA_T* const dataset_ptr,  // [dataset_size, dataset_d
 
 template <class INDEX_T>
 RAFT_KERNEL pickup_next_parents_kernel(
-  INDEX_T* const parent_candidates_ptr,        // [num_queries, lds]
-  const std::size_t lds,                       // (*) lds >= parent_candidates_size
+  INDEX_T* const parent_candidates_ptr,        // [num_queries, lds]
+  const std::size_t lds,                       // (*) lds >= parent_candidates_size
   const std::uint32_t parent_candidates_size,  //
   INDEX_T* const visited_hashmap_ptr,          // [num_queries, 1 << hash_bitlen]
   const std::size_t hash_bitlen,
@@ -268,8 +268,8 @@ RAFT_KERNEL pickup_next_parents_kernel(
 }
 
 template <class INDEX_T>
-void pickup_next_parents(INDEX_T* const parent_candidates_ptr,  // [num_queries, lds]
-                         const std::size_t lds,                 // (*) lds >= parent_candidates_size
+void pickup_next_parents(INDEX_T* const parent_candidates_ptr,  // [num_queries, lds]
+                         const std::size_t lds,                 // (*) lds >= parent_candidates_size
                          const std::size_t parent_candidates_size,  //
                          const std::size_t num_queries,
                          INDEX_T* const visited_hashmap_ptr,  // [num_queries, 1 << hash_bitlen]
@@ -291,7 +291,7 @@ void pickup_next_parents(INDEX_T* const parent_candidates_ptr,  // [num_queries,
   }
   pickup_next_parents_kernel<INDEX_T>
     <<<num_queries, block_size, 0, cuda_stream>>>(parent_candidates_ptr,
-                                                  lds,
+                                                  lds,
                                                   parent_candidates_size,
                                                   visited_hashmap_ptr,
                                                   hash_bitlen,
@@ -312,7 +312,7 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel(
   const INDEX_T* const parent_node_list,  // [num_queries, search_width]
   INDEX_T* const parent_candidates_ptr,   // [num_queries, search_width]
   DISTANCE_T* const parent_distance_ptr,  // [num_queries, search_width]
-  const std::size_t lds,
+  const std::size_t lds,
   const std::uint32_t search_width,
   const DATA_T* const dataset_ptr,  // [dataset_size, data_dim]
   const std::uint32_t data_dim,
@@ -395,7 +395,7 @@ void compute_distance_to_child_nodes(
   const INDEX_T* const parent_node_list,  // [num_queries, search_width]
   INDEX_T* const parent_candidates_ptr,   // [num_queries, search_width]
   DISTANCE_T* const parent_distance_ptr,  // [num_queries, search_width]
-  const std::size_t lds,
+  const std::size_t lds,
   const uint32_t search_width,
   const DATA_T* const dataset_ptr,  // [dataset_size, data_dim]
   const std::uint32_t data_dim,
@@ -421,7 +421,7 @@ void compute_distance_to_child_nodes(
     <<<grid_size, block_size, 0, cuda_stream>>>(parent_node_list,
                                                 parent_candidates_ptr,
                                                 parent_distance_ptr,
-                                                lds,
+                                                lds,
                                                 search_width,
                                                 dataset_ptr,
                                                 data_dim,
@@ -471,7 +471,7 @@ void remove_parent_bit(const std::uint32_t num_queries,
 template <class INDEX_T, class DISTANCE_T, class SAMPLE_FILTER_T>
 RAFT_KERNEL apply_filter_kernel(INDEX_T* const result_indices_ptr,
                                 DISTANCE_T* const result_distances_ptr,
-                                const std::size_t lds,
+                                const std::size_t lds,
                                 const std::uint32_t result_buffer_size,
                                 const std::uint32_t num_queries,
                                 const INDEX_T query_id_offset,
@@ -482,7 +482,7 @@ RAFT_KERNEL apply_filter_kernel(INDEX_T* const result_indices_ptr,
   if (tid >= result_buffer_size * num_queries) { return; }
   const auto i     = tid % result_buffer_size;
   const auto j     = tid / result_buffer_size;
-  const auto index = i + j * lds;
+  const auto index = i + j * lds;
 
   if (result_indices_ptr[index] != ~index_msb_1_mask &&
       !sample_filter(query_id_offset + j, result_indices_ptr[index])) {
@@ -494,7 +494,7 @@ RAFT_KERNEL apply_filter_kernel(INDEX_T* const result_indices_ptr,
 template <class INDEX_T, class DISTANCE_T, class SAMPLE_FILTER_T>
 void apply_filter(INDEX_T* const result_indices_ptr,
                   DISTANCE_T* const result_distances_ptr,
-                  const std::size_t lds,
+                  const std::size_t lds,
                   const std::uint32_t result_buffer_size,
                   const std::uint32_t num_queries,
                   const INDEX_T query_id_offset,
@@ -506,7 +506,7 @@ void apply_filter(INDEX_T* const result_indices_ptr,
 
   apply_filter_kernel<<<grid_size, block_size, 0, cuda_stream>>>(result_indices_ptr,
                                                                  result_distances_ptr,
-                                                                 lds,
+                                                                 lds,
                                                                  result_buffer_size,
                                                                  num_queries,
                                                                  query_id_offset,
@@ -642,7 +642,7 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T> {
       parent_node_list(0, resource::get_cuda_stream(res)),
       topk_hint(0, resource::get_cuda_stream(res)),
       topk_workspace(0, resource::get_cuda_stream(res)),
-      terminate_flag(resource::get_cuda_stream(res))
+      terminate_flag(raft::resource::get_cuda_stream(res))
   {
     set_params(res);
   }
diff --git a/cpp/include/cuvs/neighbors/detail/div_utils.hpp b/cpp/include/cuvs/neighbors/detail/div_utils.hpp
index fc6b6f5c5..805bb1304 100644
--- a/cpp/include/cuvs/neighbors/detail/div_utils.hpp
+++ b/cpp/include/cuvs/neighbors/detail/div_utils.hpp
@@ -21,9 +21,9 @@
 #endif
 
 /**
- * @brief A simple wrapper for raft::Pow2 which uses Pow2 utils only when available and regular
- * integer division otherwise. This is done to allow a common interface for division arithmetic for
- * non CUDA headers.
+ * @brief A simple wrapper for raft::Pow2 which uses raft::Pow2 utils only when available and
+ * regular integer division otherwise. This is done to allow a common interface for division
+ * arithmetic for non CUDA headers.
  *
  * @tparam Value_ a compile-time value representable as a power-of-two.
  */
@@ -37,7 +37,7 @@ struct div_utils {
   static constexpr _RAFT_HOST_DEVICE inline auto roundDown(T x)
   {
 #if defined(_RAFT_HAS_CUDA)
-    return Pow2<Value_>::roundDown(x);
+    return raft::Pow2<Value_>::roundDown(x);
 #else
     return raft::round_down_safe(x, Value_);
 #endif
@@ -47,7 +47,7 @@ struct div_utils {
   static constexpr _RAFT_HOST_DEVICE inline auto mod(T x)
   {
 #if defined(_RAFT_HAS_CUDA)
-    return Pow2<Value_>::mod(x);
+    return raft::Pow2<Value_>::mod(x);
 #else
     return x % Value_;
 #endif
@@ -57,7 +57,7 @@ struct div_utils {
   static constexpr _RAFT_HOST_DEVICE inline auto div(T x)
   {
 #if defined(_RAFT_HAS_CUDA)
-    return Pow2<Value_>::div(x);
+    return raft::Pow2<Value_>::div(x);
 #else
     return x / Value_;
 #endif
diff --git a/cpp/include/cuvs/neighbors/detail/faiss_select/MergeNetworkWarp.cuh b/cpp/include/cuvs/neighbors/detail/faiss_select/MergeNetworkWarp.cuh
index a4c895d0d..cf97d99ca 100644
--- a/cpp/include/cuvs/neighbors/detail/faiss_select/MergeNetworkWarp.cuh
+++ b/cpp/include/cuvs/neighbors/detail/faiss_select/MergeNetworkWarp.cuh
@@ -138,7 +138,7 @@ inline __device__ void warpBitonicMergeLE16(K& k, V& v)
 
 // Template for performing a bitonic merge of an arbitrary set of
 // registers
-template <typename K, typename V, int N, bool Dir, typename Comp, bool Low, bool Pow2>
+template <typename K, typename V, int N, bool Dir, typename Comp, bool Low, bool Pow2>
 struct BitonicMergeStep {};
 
 //
diff --git a/cpp/include/cuvs/neighbors/detail/faiss_select/Select.cuh b/cpp/include/cuvs/neighbors/detail/faiss_select/Select.cuh
index 9181fba79..796a841a4 100644
--- a/cpp/include/cuvs/neighbors/detail/faiss_select/Select.cuh
+++ b/cpp/include/cuvs/neighbors/detail/faiss_select/Select.cuh
@@ -127,7 +127,7 @@ struct BlockSelect {
       warpV[i] = initV;
     }
 
-    warpFence();
+    raft::warpFence();
   }
 
   __device__ inline void addThreadQ(K k, V v)
@@ -161,7 +161,7 @@ struct BlockSelect {
       return;
     }
 
-    // This has a trailing warpFence
+    // This has a trailing raft::warpFence
     mergeWarpQ();
 
     // Any top-k elements have been merged into the warp queue; we're
@@ -177,7 +177,7 @@ struct BlockSelect {
     // We have to beat at least this element
     warpKTop = warpK[kMinus1];
 
-    warpFence();
+    raft::warpFence();
   }
 
   /// This function handles sorting and merging together the
@@ -215,7 +215,7 @@ struct BlockSelect {
       warpV[i * raft::WarpSize + laneId] = warpVRegisters[i];
     }
 
-    warpFence();
+    raft::warpFence();
   }
 
   /// WARNING: all threads in a warp must participate in this.
diff --git a/cpp/include/cuvs/neighbors/detail/faiss_select/key_value_block_select.cuh b/cpp/include/cuvs/neighbors/detail/faiss_select/key_value_block_select.cuh
index 4407473a8..14484435b 100644
--- a/cpp/include/cuvs/neighbors/detail/faiss_select/key_value_block_select.cuh
+++ b/cpp/include/cuvs/neighbors/detail/faiss_select/key_value_block_select.cuh
@@ -65,7 +65,7 @@ struct KeyValueBlockSelect {
       warpV[i].value = initVv;
     }
 
-    warpFence();
+    raft::warpFence();
   }
 
   __device__ inline void addThreadQ(K k, K vk, V vv)
@@ -101,7 +101,7 @@ struct KeyValueBlockSelect {
       return;
     }
 
-    // This has a trailing warpFence
+    // This has a trailing raft::warpFence
     mergeWarpQ();
 
     // Any top-k elements have been merged into the warp queue; we're
@@ -119,7 +119,7 @@ struct KeyValueBlockSelect {
     warpKTop      = warpK[kMinus1];
     warpKTopRDist = warpV[kMinus1].key;
 
-    warpFence();
+    raft::warpFence();
   }
 
   /// This function handles sorting and merging together the
@@ -143,7 +143,7 @@ struct KeyValueBlockSelect {
       warpVRegisters[i].value = warpV[i * raft::WarpSize + laneId].value;
     }
 
-    warpFence();
+    raft::warpFence();
 
     // The warp queue is already sorted, and now that we've sorted the
     // per-thread queue, merge both sorted lists together, producing
@@ -159,7 +159,7 @@ struct KeyValueBlockSelect {
       warpV[i * raft::WarpSize + laneId].value = warpVRegisters[i].value;
     }
 
-    warpFence();
+    raft::warpFence();
   }
 
   /// WARNING: all threads in a warp must participate in this.
diff --git a/cpp/include/cuvs/neighbors/detail/ivf_flat_build.cuh b/cpp/include/cuvs/neighbors/detail/ivf_flat_build.cuh
index 98d387080..022e5eac5 100644
--- a/cpp/include/cuvs/neighbors/detail/ivf_flat_build.cuh
+++ b/cpp/include/cuvs/neighbors/detail/ivf_flat_build.cuh
@@ -134,7 +134,7 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels,
   list_index[inlist_id] = source_ixs == nullptr ? i : source_ixs[i];
 
   // The data is written in interleaved groups of `index::kGroupSize` vectors
-  using interleaved_group = Pow2<kIndexGroupSize>;
+  using interleaved_group = raft::Pow2<kIndexGroupSize>;
   auto group_offset       = interleaved_group::roundDown(inlist_id);
   auto ingroup_id         = interleaved_group::mod(inlist_id) * veclen;
 
@@ -172,7 +172,7 @@ void extend(raft::resources const& handle,
   auto dim     = index->dim();
   list_spec<uint32_t, T, IdxT> list_device_spec{index->dim(),
                                                 index->conservative_memory_allocation()};
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
     "ivf_flat::extend(%zu, %u)", size_t(n_rows), dim);
 
   RAFT_EXPECTS(new_indices != nullptr || index->size() == 0,
@@ -235,7 +235,7 @@ void extend(raft::resources const& handle,
                        lists[label],
                        list_device_spec,
                        new_list_sizes[label],
-                       Pow2<kIndexGroupSize>::roundUp(old_list_sizes[label]));
+                       raft::Pow2<kIndexGroupSize>::roundUp(old_list_sizes[label]));
     }
   }
   // Update the pointers and the sizes
@@ -305,7 +305,7 @@ inline auto build(raft::resources const& handle,
                   uint32_t dim) -> index<T, IdxT>
 {
   auto stream = resource::get_cuda_stream(handle);
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
     "ivf_flat::build(%zu, %u)", size_t(n_rows), dim);
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
                 "unsupported data type");
@@ -379,7 +379,7 @@ inline void fill_refinement_index(raft::resources const& handle,
 
   auto stream      = resource::get_cuda_stream(handle);
   uint32_t n_lists = n_queries;
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
     "ivf_flat::fill_refinement_index(%zu, %u)", size_t(n_queries));
 
   rmm::device_uvector<LabelT> new_labels(n_queries * n_candidates, stream);
@@ -456,7 +456,8 @@ void pack_list_data(
   raft::device_matrix_view<const T, uint32_t, raft::row_major> codes,
   uint32_t veclen,
   std::variant<uint32_t, const uint32_t*> offset_or_indices,
-  device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, raft::row_major> list_data)
+  raft::device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, raft::row_major>
+    list_data)
 {
   uint32_t n_rows = codes.extent(0);
   uint32_t dim    = codes.extent(1);
@@ -473,7 +474,7 @@ void pack_list_data(
 template <typename T, typename IdxT>
 void unpack_list_data(
   raft::resources const& res,
-  device_mdspan<const T, typename list_spec<uint32_t, T, IdxT>::list_extents, raft::row_major>
+  raft::device_mdspan<const T, typename list_spec<uint32_t, T, IdxT>::list_extents, raft::row_major>
     list_data,
   uint32_t veclen,
   std::variant<uint32_t, const uint32_t*> offset_or_indices,
diff --git a/cpp/include/cuvs/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh b/cpp/include/cuvs/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh
index 0978c4c7f..221da924c 100644
--- a/cpp/include/cuvs/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh
+++ b/cpp/include/cuvs/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh
@@ -206,7 +206,8 @@ struct loadAndComputeDist<kUnroll, Lambda, uint8_veclen, uint8_t, uint32_t> {
         encV,
         reinterpret_cast<unsigned const*>(data) + loadIndex + j * kIndexGroupSize * veclen_int);
       uint32_t queryRegs[veclen_int];
-      lds(queryRegs, reinterpret_cast<unsigned const*>(query_shared + shmemIndex) + j * veclen_int);
+      raft::lds(queryRegs,
+                reinterpret_cast<unsigned const*>(query_shared + shmemIndex) + j * veclen_int);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
         compute_dist(dist, queryRegs[k], encV[k]);
@@ -234,7 +235,7 @@ struct loadAndComputeDist<kUnroll, Lambda, uint8_veclen, uint8_t, uint32_t> {
         const int d = (i * kUnroll + j) * veclen_int;
 #pragma unroll
         for (int k = 0; k < veclen_int; ++k) {
-          compute_dist(dist, shfl(queryReg, d + k, raft::WarpSize), encV[k]);
+          compute_dist(dist, raft::shfl(queryReg, d + k, raft::WarpSize), encV[k]);
         }
       }
     }
@@ -255,7 +256,7 @@ struct loadAndComputeDist<kUnroll, Lambda, uint8_veclen, uint8_t, uint32_t> {
       raft::ldg(enc, reinterpret_cast<uint32_t const*>(data) + lane_id * veclen_int);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
-        uint32_t q = shfl(queryReg, (d / 4) + k, raft::WarpSize);
+        uint32_t q = raft::shfl(queryReg, (d / 4) + k, raft::WarpSize);
         compute_dist(dist, q, enc[k]);
       }
     }
@@ -301,7 +302,7 @@ struct loadAndComputeDist<kUnroll, Lambda, 4, uint8_t, uint32_t> {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
         uint32_t encV = reinterpret_cast<unsigned const*>(data)[lane_id + j * kIndexGroupSize];
-        uint32_t q    = shfl(queryReg, i * kUnroll + j, raft::WarpSize);
+        uint32_t q    = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize);
         compute_dist(dist, q, encV);
       }
     }
@@ -318,7 +319,7 @@ struct loadAndComputeDist<kUnroll, Lambda, 4, uint8_t, uint32_t> {
     uint32_t queryReg    = loadDim < dim ? reinterpret_cast<unsigned const*>(query)[loadDim] : 0;
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
       uint32_t enc = reinterpret_cast<unsigned const*>(data)[lane_id];
-      uint32_t q   = shfl(queryReg, d / veclen, raft::WarpSize);
+      uint32_t q   = raft::shfl(queryReg, d / veclen, raft::WarpSize);
       compute_dist(dist, q, enc);
     }
   }
@@ -362,7 +363,7 @@ struct loadAndComputeDist<kUnroll, Lambda, 2, uint8_t, uint32_t> {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
         uint32_t encV = reinterpret_cast<uint16_t const*>(data)[lane_id + j * kIndexGroupSize];
-        uint32_t q    = shfl(queryReg, i * kUnroll + j, raft::WarpSize);
+        uint32_t q    = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize);
         compute_dist(dist, q, encV);
       }
     }
@@ -379,7 +380,7 @@ struct loadAndComputeDist<kUnroll, Lambda, 2, uint8_t, uint32_t> {
     uint32_t queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
       uint32_t enc = reinterpret_cast<uint16_t const*>(data)[lane_id];
-      uint32_t q   = shfl(queryReg, d / veclen, raft::WarpSize);
+      uint32_t q   = raft::shfl(queryReg, d / veclen, raft::WarpSize);
       compute_dist(dist, q, enc);
     }
   }
@@ -422,7 +423,7 @@ struct loadAndComputeDist<kUnroll, Lambda, 1, uint8_t, uint32_t> {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
         uint32_t encV = data[lane_id + j * kIndexGroupSize];
-        uint32_t q    = shfl(queryReg, i * kUnroll + j, raft::WarpSize);
+        uint32_t q    = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize);
         compute_dist(dist, q, encV);
       }
     }
@@ -439,7 +440,7 @@ struct loadAndComputeDist<kUnroll, Lambda, 1, uint8_t, uint32_t> {
     uint32_t queryReg    = loadDim < dim ? query[loadDim] : 0;
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
       uint32_t enc = data[lane_id];
-      uint32_t q   = shfl(queryReg, d, raft::WarpSize);
+      uint32_t q   = raft::shfl(queryReg, d, raft::WarpSize);
       compute_dist(dist, q, enc);
     }
   }
@@ -519,7 +520,7 @@ struct loadAndComputeDist<kUnroll, Lambda, int8_veclen, int8_t, int32_t> {
       raft::ldg(enc, reinterpret_cast<int32_t const*>(data) + lane_id * veclen_int);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
-        int32_t q = shfl(queryReg, (d / 4) + k, raft::WarpSize);  // Here 4 is for 1 - int;
+        int32_t q = raft::shfl(queryReg, (d / 4) + k, raft::WarpSize);  // Here 4 is for 1 - int;
         compute_dist(dist, q, enc[k]);
       }
     }
@@ -562,7 +563,7 @@ struct loadAndComputeDist<kUnroll, Lambda, 2, int8_t, int32_t> {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
         int32_t encV = reinterpret_cast<uint16_t const*>(data)[lane_id + j * kIndexGroupSize];
-        int32_t q    = shfl(queryReg, i * kUnroll + j, raft::WarpSize);
+        int32_t q    = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize);
         compute_dist(dist, q, encV);
       }
     }
@@ -576,7 +577,7 @@ struct loadAndComputeDist<kUnroll, Lambda, 2, int8_t, int32_t> {
     int32_t queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
       int32_t enc = reinterpret_cast<uint16_t const*>(data + lane_id * veclen)[0];
-      int32_t q   = shfl(queryReg, d / veclen, raft::WarpSize);
+      int32_t q   = raft::shfl(queryReg, d / veclen, raft::WarpSize);
       compute_dist(dist, q, enc);
     }
   }
@@ -702,8 +703,8 @@ RAFT_KERNEL __launch_bounds__(kThreadsPerBlock)
   copy_vectorized(query_shared, query, std::min(dim, query_smem_elems));
   __syncthreads();
 
-  using block_sort_t = matrix::detail::select::warpsort::block_sort<
-    matrix::detail::select::warpsort::warp_sort_filtered,
+  using block_sort_t = raft::matrix::detail::select::warpsort::block_sort<
+    raft::matrix::detail::select::warpsort::warp_sort_filtered,
     Capacity,
     Ascending,
     float,
@@ -711,7 +712,7 @@ RAFT_KERNEL __launch_bounds__(kThreadsPerBlock)
   block_sort_t queue(k);
 
   {
-    using align_warp  = Pow2<raft::WarpSize>;
+    using align_warp  = raft::Pow2<raft::WarpSize>;
     const int lane_id = align_warp::mod(threadIdx.x);
 
     // How many full warps needed to compute the distance (without remainder)
@@ -758,7 +759,7 @@ RAFT_KERNEL __launch_bounds__(kThreadsPerBlock)
         }
 
         if (dim > query_smem_elems) {
-          // The default path - using shfl ops - for dimensions beyond query_smem_elems
+          // The default path - using raft::shfl ops - for dimensions beyond query_smem_elems
           loadAndComputeDist<kUnroll, decltype(compute_dist), Veclen, T, AccT> lc(dist,
                                                                                   compute_dist);
           for (int pos = shm_assisted_dim; pos < full_warps_along_dim; pos += raft::WarpSize) {
@@ -835,7 +836,7 @@ void launch_kernel(Lambda lambda,
 {
   RAFT_EXPECTS(Veclen == index.veclen(),
                "Configured Veclen does not match the index interleaving pattern.");
-  constexpr auto kKernel   = interleaved_scan_kernel<Capacity,
+  constexpr auto kKernel     = interleaved_scan_kernel<Capacity,
                                                    Veclen,
                                                    Ascending,
                                                    T,
@@ -844,9 +845,9 @@ void launch_kernel(Lambda lambda,
                                                    IvfSampleFilterT,
                                                    Lambda,
                                                    PostLambda>;
-  const int max_query_smem = 16384;
-  int query_smem_elems =
-    std::min<int>(max_query_smem / sizeof(T), Pow2<Veclen * raft::WarpSize>::roundUp(index.dim()));
+  const int max_query_smem   = 16384;
+  int query_smem_elems       = std::min<int>(max_query_smem / sizeof(T),
+                                       raft::Pow2<Veclen * raft::WarpSize>::roundUp(index.dim()));
   int smem_size              = query_smem_elems * sizeof(T);
   constexpr int kSubwarpSize = std::min<int>(Capacity, raft::WarpSize);
   auto block_merge_mem =
@@ -911,7 +912,7 @@ struct euclidean_dist<Veclen, uint8_t, uint32_t> {
   {
     if constexpr (Veclen > 1) {
       const auto diff = __vabsdiffu4(x, y);
-      acc             = dp4a(diff, diff, acc);
+      acc             = raft::dp4a(diff, diff, acc);
     } else {
       const auto diff = __usad(x, y, 0u);
       acc += diff * diff;
@@ -924,12 +925,12 @@ struct euclidean_dist<Veclen, int8_t, int32_t> {
   __device__ __forceinline__ void operator()(int32_t& acc, int32_t x, int32_t y)
   {
     if constexpr (Veclen > 1) {
-      // Note that we enforce here that the unsigned version of dp4a is used, because the difference
-      // between two int8 numbers can be greater than 127 and therefore represented as a negative
-      // number in int8. Casting from int8 to int32 would yield incorrect results, while casting
-      // from uint8 to uint32 is correct.
+      // Note that we enforce here that the unsigned version of raft::dp4a is used, because the
+      // difference between two int8 numbers can be greater than 127 and therefore represented as a
+      // negative number in int8. Casting from int8 to int32 would yield incorrect results, while
+      // casting from uint8 to uint32 is correct.
       const auto diff = __vabsdiffs4(x, y);
-      acc             = dp4a(diff, diff, static_cast<uint32_t>(acc));
+      acc             = raft::dp4a(diff, diff, static_cast<uint32_t>(acc));
     } else {
       const auto diff = x - y;
       acc += diff * diff;
@@ -1042,7 +1043,7 @@ struct select_interleaved_scan_kernel {
     RAFT_EXPECTS(capacity == Capacity,
                  "Capacity must be power-of-two not bigger than the maximum allowed size "
                  "matrix::detail::select::warpsort::kMaxCapacity (%d).",
-                 matrix::detail::select::warpsort::kMaxCapacity);
+                 raft::matrix::detail::select::warpsort::kMaxCapacity);
     RAFT_EXPECTS(
       veclen == Veclen,
       "Veclen must be power-of-two not bigger than the maximum allowed size for this data type.");
diff --git a/cpp/include/cuvs/neighbors/detail/ivf_flat_search-inl.cuh b/cpp/include/cuvs/neighbors/detail/ivf_flat_search-inl.cuh
index 39032a575..7f613963b 100644
--- a/cpp/include/cuvs/neighbors/detail/ivf_flat_search-inl.cuh
+++ b/cpp/include/cuvs/neighbors/detail/ivf_flat_search-inl.cuh
@@ -27,7 +27,7 @@
 #include <raft/linalg/gemm.cuh>                   // raft::linalg::gemm
 #include <raft/linalg/norm.cuh>                   // raft::linalg::norm
 #include <raft/linalg/unary_op.cuh>               // raft::linalg::unary_op
-#include <raft/matrix/detail/select_k.cuh>        // matrix::detail::select_k
+#include <raft/matrix/detail/select_k.cuh>        // raft::matrix::detail::select_k
 #include <rmm/mr/device/per_device_resource.hpp>  // rmm::device_memory_resource
 
 namespace cuvs::neighbors::ivf_flat::detail {
@@ -127,16 +127,16 @@ void search_impl(raft::resources const& handle,
                stream);
 
   RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), std::min<uint32_t>(20, index.n_lists()));
-  matrix::detail::select_k<AccT, uint32_t>(handle,
-                                           distance_buffer_dev.data(),
-                                           nullptr,
-                                           n_queries,
-                                           index.n_lists(),
-                                           n_probes,
-                                           coarse_distances_dev.data(),
-                                           coarse_indices_dev.data(),
-                                           select_min,
-                                           search_mr);
+  raft::matrix::detail::select_k<AccT, uint32_t>(handle,
+                                                 distance_buffer_dev.data(),
+                                                 nullptr,
+                                                 n_queries,
+                                                 index.n_lists(),
+                                                 n_probes,
+                                                 coarse_distances_dev.data(),
+                                                 coarse_indices_dev.data(),
+                                                 select_min,
+                                                 search_mr);
   RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), n_probes);
   RAFT_LOG_TRACE_VEC(coarse_distances_dev.data(), n_probes);
 
@@ -191,16 +191,16 @@ void search_impl(raft::resources const& handle,
 
   // Merge topk values from different blocks
   if (grid_dim_x > 1) {
-    matrix::detail::select_k<AccT, IdxT>(handle,
-                                         refined_distances_dev.data(),
-                                         refined_indices_dev.data(),
-                                         n_queries,
-                                         k * grid_dim_x,
-                                         k,
-                                         distances,
-                                         neighbors,
-                                         select_min,
-                                         search_mr);
+    raft::matrix::detail::select_k<AccT, IdxT>(handle,
+                                               refined_distances_dev.data(),
+                                               refined_indices_dev.data(),
+                                               n_queries,
+                                               k * grid_dim_x,
+                                               k,
+                                               distances,
+                                               neighbors,
+                                               select_min,
+                                               search_mr);
   }
 }
 
@@ -219,7 +219,7 @@ inline void search(raft::resources const& handle,
                    rmm::mr::device_memory_resource* mr = nullptr,
                    IvfSampleFilterT sample_filter      = IvfSampleFilterT())
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
     "ivf_flat::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim());
 
   RAFT_EXPECTS(params.n_probes > 0,
diff --git a/cpp/include/cuvs/neighbors/detail/ivf_flat_serialize.cuh b/cpp/include/cuvs/neighbors/detail/ivf_flat_serialize.cuh
index 1f181b517..60d2392be 100644
--- a/cpp/include/cuvs/neighbors/detail/ivf_flat_serialize.cuh
+++ b/cpp/include/cuvs/neighbors/detail/ivf_flat_serialize.cuh
@@ -86,7 +86,7 @@ void serialize(raft::resources const& handle, std::ostream& os, const index<T, I
                         os,
                         index_.lists()[label],
                         list_store_spec,
-                        Pow2<kIndexGroupSize>::roundUp(sizes_host(label)));
+                        raft::Pow2<kIndexGroupSize>::roundUp(sizes_host(label)));
   }
   resource::sync_stream(handle);
 }
diff --git a/cpp/include/cuvs/neighbors/detail/ivf_pq_build.cuh b/cpp/include/cuvs/neighbors/detail/ivf_pq_build.cuh
index de89ea2d9..c3d3152e5 100644
--- a/cpp/include/cuvs/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/cuvs/neighbors/detail/ivf_pq_build.cuh
@@ -68,7 +68,7 @@ template <uint32_t BlockDim, typename T, typename S>
 __launch_bounds__(BlockDim) RAFT_KERNEL copy_warped_kernel(
   T* out, uint32_t ld_out, const S* in, uint32_t ld_in, uint32_t n_cols, size_t n_rows)
 {
-  using warp    = Pow2<WarpSize>;
+  using warp    = raft::Pow2<WarpSize>;
   size_t row_ix = warp::div(size_t(threadIdx.x) + size_t(BlockDim) * size_t(blockIdx.x));
   uint32_t i    = warp::mod(threadIdx.x);
   if (row_ix >= n_rows) return;
@@ -104,7 +104,7 @@ void copy_warped(T* out,
 {
   constexpr uint32_t kBlockDim = 128;
   dim3 threads(kBlockDim, 1, 1);
-  dim3 blocks(div_rounding_up_safe<size_t>(n_rows, kBlockDim / WarpSize), 1, 1);
+  dim3 blocks(div_rounding_up_safe<size_t>(n_rows, kBlockDim / raft::WarpSize), 1, 1);
   copy_warped_kernel<kBlockDim, T, S>
     <<<blocks, threads, 0, stream>>>(out, ld_out, in, ld_in, n_cols, n_rows);
 }
@@ -126,7 +126,7 @@ inline void make_rotation_matrix(raft::resources const& handle,
                                  float* rotation_matrix,
                                  raft::random::RngState rng = raft::random::RngState(7ULL))
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
     "ivf_pq::make_rotation_matrix(%u * %u)", n_rows, n_cols);
   auto stream  = resource::get_cuda_stream(handle);
   bool inplace = n_rows == n_cols;
@@ -178,8 +178,8 @@ void select_residuals(raft::resources const& handle,
 {
   auto stream = resource::get_cuda_stream(handle);
   rmm::device_uvector<float> tmp(size_t(n_rows) * size_t(dim), stream, device_memory);
-  // Note: the number of rows of the input dataset isn't actually n_rows, but matrix::gather doesn't
-  // need to know it, any strictly positive number would work.
+  // Note: the number of rows of the input dataset isn't actually n_rows, but raft::matrix::gather
+  // doesn't need to know it, any strictly positive number would work.
   cub::TransformInputIterator<float, utils::mapping<float>, const T*> mapping_itr(
     dataset, utils::mapping<float>{});
   raft::matrix::gather(mapping_itr, (IdxT)dim, n_rows, row_ids, n_rows, tmp.data(), stream);
@@ -411,7 +411,7 @@ void train_per_subset(raft::resources const& handle,
   rmm::device_uvector<uint32_t> pq_cluster_sizes(index.pq_book_size(), stream, device_memory);
 
   for (uint32_t j = 0; j < index.pq_dim(); j++) {
-    common::nvtx::range<common::nvtx::domain::raft> pq_per_subspace_scope(
+    raft::common::nvtx::range<raft::common::nvtx::domain::raft> pq_per_subspace_scope(
       "ivf_pq::build::per_subspace[%u]", j);
 
     // Get the rotated cluster centers for each training vector.
@@ -509,7 +509,7 @@ void train_per_cluster(raft::resources const& handle,
   for (uint32_t l = 0; l < index.n_lists(); l++) {
     auto cluster_size = cluster_sizes.data()[l];
     if (cluster_size == 0) continue;
-    common::nvtx::range<common::nvtx::domain::raft> pq_per_cluster_scope(
+    raft::common::nvtx::range<raft::common::nvtx::domain::raft> pq_per_cluster_scope(
       "ivf_pq::build::per_cluster[%u](size = %u)", l, cluster_size);
 
     select_residuals(handle,
@@ -569,8 +569,8 @@ void train_per_cluster(raft::resources const& handle,
 template <typename T, typename IdxT>
 static __device__ auto reinterpret_vectors(
   raft::device_matrix_view<T, IdxT, raft::row_major> vectors,
-  device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> pq_centers)
-  -> device_mdspan<T, extent_3d<IdxT>, raft::row_major>
+  raft::device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> pq_centers)
+  -> raft::device_mdspan<T, extent_3d<IdxT>, raft::row_major>
 {
   const uint32_t pq_len = pq_centers.extent(1);
   const uint32_t pq_dim = vectors.extent(1) / pq_len;
@@ -608,7 +608,7 @@ struct unpack_codes {
 template <uint32_t BlockSize, uint32_t PqBits>
 __launch_bounds__(BlockSize) RAFT_KERNEL unpack_list_data_kernel(
   raft::device_matrix_view<uint8_t, uint32_t, raft::row_major> out_codes,
-  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+  raft::device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
     in_list_data,
   std::variant<uint32_t, const uint32_t*> offset_or_indices)
 {
@@ -628,7 +628,7 @@ __launch_bounds__(BlockSize) RAFT_KERNEL unpack_list_data_kernel(
  */
 inline void unpack_list_data(
   raft::device_matrix_view<uint8_t, uint32_t, raft::row_major> codes,
-  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+  raft::device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
     list_data,
   std::variant<uint32_t, const uint32_t*> offset_or_indices,
   uint32_t pq_bits,
@@ -699,7 +699,7 @@ struct unpack_contiguous {
 template <uint32_t BlockSize, uint32_t PqBits>
 __launch_bounds__(BlockSize) RAFT_KERNEL unpack_contiguous_list_data_kernel(
   uint8_t* out_codes,
-  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+  raft::device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
     in_list_data,
   uint32_t n_rows,
   uint32_t pq_dim,
@@ -720,7 +720,7 @@ __launch_bounds__(BlockSize) RAFT_KERNEL unpack_contiguous_list_data_kernel(
  */
 inline void unpack_contiguous_list_data(
   uint8_t* codes,
-  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+  raft::device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
     list_data,
   uint32_t n_rows,
   uint32_t pq_dim,
@@ -771,9 +771,9 @@ struct reconstruct_vectors {
   codebook_gen codebook_kind;
   uint32_t cluster_ix;
   uint32_t pq_len;
-  device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> pq_centers;
-  device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> centers_rot;
-  device_mdspan<float, extent_3d<uint32_t>, raft::row_major> out_vectors;
+  raft::device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> pq_centers;
+  raft::device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> centers_rot;
+  raft::device_mdspan<float, extent_3d<uint32_t>, raft::row_major> out_vectors;
 
   /**
    * Create a callable to be passed to `run_on_list`.
@@ -786,7 +786,7 @@ struct reconstruct_vectors {
    */
   __device__ inline reconstruct_vectors(
     raft::device_matrix_view<float, uint32_t, raft::row_major> out_vectors,
-    device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> pq_centers,
+    raft::device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> pq_centers,
     raft::device_matrix_view<const float, uint32_t, raft::row_major> centers_rot,
     codebook_gen codebook_kind,
     uint32_t cluster_ix)
@@ -824,9 +824,9 @@ struct reconstruct_vectors {
 template <uint32_t BlockSize, uint32_t PqBits>
 __launch_bounds__(BlockSize) RAFT_KERNEL reconstruct_list_data_kernel(
   raft::device_matrix_view<float, uint32_t, raft::row_major> out_vectors,
-  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+  raft::device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
     in_list_data,
-  device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> pq_centers,
+  raft::device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> pq_centers,
   raft::device_matrix_view<const float, uint32_t, raft::row_major> centers_rot,
   codebook_gen codebook_kind,
   uint32_t cluster_ix,
@@ -943,7 +943,8 @@ struct pass_codes {
 
 template <uint32_t BlockSize, uint32_t PqBits>
 __launch_bounds__(BlockSize) RAFT_KERNEL pack_list_data_kernel(
-  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major> list_data,
+  raft::device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+    list_data,
   raft::device_matrix_view<const uint8_t, uint32_t, raft::row_major> codes,
   std::variant<uint32_t, const uint32_t*> offset_or_indices)
 {
@@ -963,7 +964,8 @@ __launch_bounds__(BlockSize) RAFT_KERNEL pack_list_data_kernel(
  * @param[in] stream
  */
 inline void pack_list_data(
-  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major> list_data,
+  raft::device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+    list_data,
   raft::device_matrix_view<const uint8_t, uint32_t, raft::row_major> codes,
   std::variant<uint32_t, const uint32_t*> offset_or_indices,
   uint32_t pq_bits,
@@ -1032,7 +1034,8 @@ struct pack_contiguous {
 
 template <uint32_t BlockSize, uint32_t PqBits>
 __launch_bounds__(BlockSize) RAFT_KERNEL pack_contiguous_list_data_kernel(
-  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major> list_data,
+  raft::device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+    list_data,
   const uint8_t* codes,
   uint32_t n_rows,
   uint32_t pq_dim,
@@ -1054,7 +1057,8 @@ __launch_bounds__(BlockSize) RAFT_KERNEL pack_contiguous_list_data_kernel(
  * @param[in] stream
  */
 inline void pack_contiguous_list_data(
-  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major> list_data,
+  raft::device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+    list_data,
   const uint8_t* codes,
   uint32_t n_rows,
   uint32_t pq_dim,
@@ -1106,7 +1110,7 @@ void pack_contiguous_list_data(raft::resources const& res,
  *
  * @tparam SubWarpSize
  *   how many threads work on a single vector;
- *   bounded by either WarpSize or pq_book_size.
+ *   bounded by either raft::WarpSize or pq_book_size.
  *
  * @param pq_centers
  *   - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size]
@@ -1124,11 +1128,11 @@ template <uint32_t SubWarpSize, typename IdxT>
 struct encode_vectors {
   codebook_gen codebook_kind;
   uint32_t cluster_ix;
-  device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> pq_centers;
-  device_mdspan<const float, extent_3d<IdxT>, raft::row_major> in_vectors;
+  raft::device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> pq_centers;
+  raft::device_mdspan<const float, extent_3d<IdxT>, raft::row_major> in_vectors;
 
   __device__ inline encode_vectors(
-    device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> pq_centers,
+    raft::device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> pq_centers,
     raft::device_matrix_view<const float, IdxT, raft::row_major> in_vectors,
     codebook_gen codebook_kind,
     uint32_t cluster_ix)
@@ -1145,7 +1149,7 @@ struct encode_vectors {
    */
   __device__ inline auto operator()(IdxT i, uint32_t j) -> uint8_t
   {
-    uint32_t lane_id = Pow2<SubWarpSize>::mod(laneId());
+    uint32_t lane_id = raft::Pow2<SubWarpSize>::mod(laneId());
     uint32_t partition_ix;
     switch (codebook_kind) {
       case codebook_gen::PER_CLUSTER: {
@@ -1196,11 +1200,11 @@ __launch_bounds__(BlockSize) RAFT_KERNEL process_and_fill_codes_kernel(
   raft::device_vector_view<uint32_t, uint32_t, raft::row_major> list_sizes,
   raft::device_vector_view<IdxT*, uint32_t, raft::row_major> inds_ptrs,
   raft::device_vector_view<uint8_t*, uint32_t, raft::row_major> data_ptrs,
-  device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> pq_centers,
+  raft::device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> pq_centers,
   codebook_gen codebook_kind)
 {
   constexpr uint32_t kSubWarpSize = std::min<uint32_t>(WarpSize, 1u << PqBits);
-  using subwarp_align             = Pow2<kSubWarpSize>;
+  using subwarp_align             = raft::Pow2<kSubWarpSize>;
   const uint32_t lane_id          = subwarp_align::mod(threadIdx.x);
   const IdxT row_ix = subwarp_align::div(IdxT{threadIdx.x} + IdxT{BlockSize} * IdxT{blockIdx.x});
   if (row_ix >= new_vectors.extent(0)) { return; }
@@ -1208,7 +1212,7 @@ __launch_bounds__(BlockSize) RAFT_KERNEL process_and_fill_codes_kernel(
   const uint32_t cluster_ix = new_labels[row_ix];
   uint32_t out_ix;
   if (lane_id == 0) { out_ix = atomicAdd(&list_sizes(cluster_ix), 1); }
-  out_ix = shfl(out_ix, 0, kSubWarpSize);
+  out_ix = raft::shfl(out_ix, 0, kSubWarpSize);
 
   // write the label  (one record per subwarp)
   auto pq_indices = inds_ptrs(cluster_ix);
@@ -1235,9 +1239,10 @@ __launch_bounds__(BlockSize) RAFT_KERNEL process_and_fill_codes_kernel(
 
 template <uint32_t BlockSize, uint32_t PqBits>
 __launch_bounds__(BlockSize) RAFT_KERNEL encode_list_data_kernel(
-  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major> list_data,
+  raft::device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+    list_data,
   raft::device_matrix_view<const float, uint32_t, raft::row_major> new_vectors,
-  device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> pq_centers,
+  raft::device_mdspan<const float, extent_3d<uint32_t>, raft::row_major> pq_centers,
   codebook_gen codebook_kind,
   uint32_t cluster_ix,
   std::variant<uint32_t, const uint32_t*> offset_or_indices)
@@ -1564,7 +1569,7 @@ void extend(raft::resources const& handle,
             const IdxT* new_indices,
             IdxT n_rows)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
     "ivf_pq::extend(%zu, %u)", size_t(n_rows), index->dim());
 
   resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::extend");
@@ -1764,7 +1769,7 @@ auto build(raft::resources const& handle,
            IdxT n_rows,
            uint32_t dim) -> index<IdxT>
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
     "ivf_pq::build(%zu, %u)", size_t(n_rows), dim);
   resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::build");
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
diff --git a/cpp/include/cuvs/neighbors/detail/ivf_pq_codepacking.cuh b/cpp/include/cuvs/neighbors/detail/ivf_pq_codepacking.cuh
index 3ae826424..bbd47baa0 100644
--- a/cpp/include/cuvs/neighbors/detail/ivf_pq_codepacking.cuh
+++ b/cpp/include/cuvs/neighbors/detail/ivf_pq_codepacking.cuh
@@ -31,7 +31,7 @@
 namespace cuvs::neighbors::ivf_pq::detail {
 
 /** A chunk of PQ-encoded vector managed by one CUDA thread. */
-using pq_vec_t = TxN_t<uint8_t, kIndexGroupVecLen>::io_t;
+using pq_vec_t = raft::TxN_t<uint8_t, kIndexGroupVecLen>::io_t;
 
 /**
  * This type mimics the `uint8_t&` for the indexing operator of `bitfield_view_t`.
@@ -81,7 +81,8 @@ struct bitfield_view_t {
   constexpr auto operator[](uint32_t i) -> bitfield_ref_t<Bits>
   {
     uint32_t bit_offset = i * Bits;
-    return bitfield_ref_t<Bits>{raw + Pow2<8>::div(bit_offset), Pow2<8>::mod(bit_offset)};
+    return bitfield_ref_t<Bits>{raw + raft::Pow2<8>::div(bit_offset),
+                                raft::Pow2<8>::mod(bit_offset)};
   }
 };
 
@@ -100,14 +101,14 @@ struct bitfield_view_t {
  */
 template <uint32_t PqBits, typename Action>
 __device__ void run_on_vector(
-  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+  raft::device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
     in_list_data,
   uint32_t in_ix,
   uint32_t out_ix,
   uint32_t pq_dim,
   Action action)
 {
-  using group_align         = Pow2<kIndexGroupSize>;
+  using group_align         = raft::Pow2<kIndexGroupSize>;
   const uint32_t group_ix   = group_align::div(in_ix);
   const uint32_t ingroup_ix = group_align::mod(in_ix);
 
@@ -143,16 +144,16 @@ __device__ void run_on_vector(
  */
 template <uint32_t PqBits, uint32_t SubWarpSize, typename IdxT, typename Action>
 __device__ void write_vector(
-  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+  raft::device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
     out_list_data,
   uint32_t out_ix,
   IdxT in_ix,
   uint32_t pq_dim,
   Action action)
 {
-  const uint32_t lane_id = Pow2<SubWarpSize>::mod(threadIdx.x);
+  const uint32_t lane_id = raft::Pow2<SubWarpSize>::mod(threadIdx.x);
 
-  using group_align         = Pow2<kIndexGroupSize>;
+  using group_align         = raft::Pow2<kIndexGroupSize>;
   const uint32_t group_ix   = group_align::div(out_ix);
   const uint32_t ingroup_ix = group_align::mod(out_ix);
 
@@ -179,7 +180,7 @@ __device__ void write_vector(
 /** Process the given indices or a block of a single list (cluster). */
 template <uint32_t PqBits, typename Action>
 __device__ void run_on_list(
-  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+  raft::device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
     in_list_data,
   std::variant<uint32_t, const uint32_t*> offset_or_indices,
   uint32_t len,
@@ -197,14 +198,14 @@ __device__ void run_on_list(
 /** Process the given indices or a block of a single list (cluster). */
 template <uint32_t PqBits, uint32_t SubWarpSize, typename Action>
 __device__ void write_list(
-  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+  raft::device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
     out_list_data,
   std::variant<uint32_t, const uint32_t*> offset_or_indices,
   uint32_t len,
   uint32_t pq_dim,
   Action action)
 {
-  using subwarp_align = Pow2<SubWarpSize>;
+  using subwarp_align = raft::Pow2<SubWarpSize>;
   uint32_t stride     = subwarp_align::div(blockDim.x);
   uint32_t ix         = subwarp_align::div(threadIdx.x + blockDim.x * blockIdx.x);
   for (; ix < len; ix += stride) {
diff --git a/cpp/include/cuvs/neighbors/detail/ivf_pq_compute_similarity-inl.cuh b/cpp/include/cuvs/neighbors/detail/ivf_pq_compute_similarity-inl.cuh
index 7526a8052..c5c1be45c 100644
--- a/cpp/include/cuvs/neighbors/detail/ivf_pq_compute_similarity-inl.cuh
+++ b/cpp/include/cuvs/neighbors/detail/ivf_pq_compute_similarity-inl.cuh
@@ -20,7 +20,7 @@
 #include <cuvs/neighbors/detail/ivf_pq_dummy_block_sort.cuh>  // dummy_block_sort_t
 #include <cuvs/neighbors/ivf_pq_types.hpp>                    // codebook_gen
 #include <cuvs/neighbors/sample_filter_types.hpp>             // none_ivf_sample_filter
-#include <raft/matrix/detail/select_warpsort.cuh>  // matrix::detail::select::warpsort::warp_sort_distributed
+#include <raft/matrix/detail/select_warpsort.cuh>  // raft::matrix::detail::select::warpsort::warp_sort_distributed
 #include <raft/util/cuda_rt_essentials.hpp>  // RAFT_CUDA_TRY
 #include <raft/util/device_atomics.cuh>      // raft::atomicMin
 #include <raft/util/pow2_utils.cuh>          // raft::Pow2
@@ -37,7 +37,7 @@ namespace cuvs::neighbors::ivf_pq::detail {
  */
 static constexpr int kMaxCapacity = 128;
 static_assert((kMaxCapacity >= 32) && !(kMaxCapacity & (kMaxCapacity - 1)),
-              "kMaxCapacity must be a power of two, not smaller than the WarpSize.");
+              "kMaxCapacity must be a power of two, not smaller than the raft::WarpSize.");
 
 // using weak attribute here, because it may be compiled multiple times.
 auto RAFT_WEAK_FUNCTION is_local_topk_feasible(uint32_t k, uint32_t n_probes, uint32_t n_queries)
@@ -50,8 +50,8 @@ auto RAFT_WEAK_FUNCTION is_local_topk_feasible(uint32_t k, uint32_t n_probes, ui
 
 template <int Capacity, typename T, typename IdxT>
 struct pq_block_sort {
-  using type = matrix::detail::select::warpsort::block_sort<
-    matrix::detail::select::warpsort::warp_sort_distributed_ext,
+  using type = raft::matrix::detail::select::warpsort::block_sort<
+    raft::matrix::detail::select::warpsort::warp_sort_distributed_ext,
     Capacity,
     true,
     T,
@@ -104,7 +104,7 @@ constexpr inline auto estimate_carveout(double shmem_fraction,
                                         size_t shmem_per_block,
                                         const cudaDeviceProp& dev_props) -> int
 {
-  using shmem_unit = Pow2<128>;
+  using shmem_unit = raft::Pow2<128>;
   size_t m         = shmem_unit::roundUp(shmem_per_block);
   size_t r         = dev_props.reservedSharedMemPerBlock;
   size_t s         = dev_props.sharedMemPerMultiprocessor;
@@ -437,11 +437,11 @@ RAFT_KERNEL compute_similarity_kernel(uint32_t dim,
     // Then, such a chunk contains `chunk_size = 128 / pq_bits` record elements, and the record
     // consists of `ceildiv(pq_dim, chunk_size)` chunks. The chunks are interleaved in groups of 32,
     // so that the warp can achieve the best coalesced read throughput.
-    using group_align  = Pow2<kIndexGroupSize>;
-    using vec_align    = Pow2<kIndexGroupVecLen>;
+    using group_align  = raft::Pow2<kIndexGroupSize>;
+    using vec_align    = raft::Pow2<kIndexGroupVecLen>;
     using local_topk_t = block_sort_t<Capacity, OutT, uint32_t>;
     using op_t         = uint32_t;
-    using vec_t        = TxN_t<op_t, kIndexGroupVecLen / sizeof(op_t)>;
+    using vec_t        = raft::TxN_t<op_t, kIndexGroupVecLen / sizeof(op_t)>;
 
     uint32_t sample_offset = 0;
     if (probe_ix > 0) { sample_offset = chunk_indices[probe_ix - 1]; }
@@ -453,7 +453,7 @@ RAFT_KERNEL compute_similarity_kernel(uint32_t dim,
                           group_align::mod(threadIdx.x) * vec_align::Value;
     pq_line_width *= blockDim.x;
 
-    constexpr OutT kDummy = upper_bound<OutT>();
+    constexpr OutT kDummy = raft::upper_bound<OutT>();
     OutT query_kth        = kDummy;
     if constexpr (kManageLocalTopK) { query_kth = OutT(query_kths[query_ix]); }
     OutT early_stop_limit = kDummy;
@@ -585,7 +585,7 @@ auto get_compute_similarity_kernel(uint32_t pq_bits, uint32_t k_max)
 /** Estimate the occupancy for the given kernel on the given device. */
 template <typename OutT, typename LutT, typename IvfSampleFilterT>
 struct occupancy_t {
-  using shmem_unit = Pow2<128>;
+  using shmem_unit = raft::Pow2<128>;
 
   int blocks_per_sm = 0;
   double occupancy  = 0.0;
@@ -725,7 +725,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props,
     ltk_reduce_mem_t(bool manage_local_topk, uint32_t topk)
       : manage_local_topk(manage_local_topk), topk(topk)
     {
-      subwarp_size = WarpSize;
+      subwarp_size = raft::WarpSize;
       while (topk * 2 <= subwarp_size) {
         subwarp_size /= 2;
       }
@@ -733,11 +733,10 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props,
 
     [[nodiscard]] auto operator()(uint32_t n_threads) const -> size_t
     {
-      return manage_local_topk
-               ? matrix::detail::select::warpsort::template calc_smem_size_for_block_wide<OutT,
-                                                                                          uint32_t>(
-                   n_threads / subwarp_size, topk)
-               : 0;
+      return manage_local_topk ? raft::matrix::detail::select::warpsort::
+                                   template calc_smem_size_for_block_wide<OutT, uint32_t>(
+                                     n_threads / subwarp_size, topk)
+                               : 0;
     }
   } ltk_reduce_mem{manage_local_topk, topk};
 
@@ -760,7 +759,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props,
   //   1. It's a power-of-two for efficient L1 caching of pq_centers values
   //      (multiples of `1 << pq_bits`).
   //   2. It should be large enough to fully utilize an SM.
-  uint32_t n_threads_min = WarpSize;
+  uint32_t n_threads_min = raft::WarpSize;
   while (dev_props.maxBlocksPerMultiProcessor * int(n_threads_min) <
          dev_props.maxThreadsPerMultiProcessor) {
     n_threads_min *= 2;
@@ -782,7 +781,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props,
 
   // Granularity of changing the number of threads when computing the maximum block size.
   // It's good to have it multiple of the PQ book width.
-  uint32_t n_threads_gty = round_up_safe<uint32_t>(1u << pq_bits, WarpSize);
+  uint32_t n_threads_gty = raft::round_up_safe<uint32_t>(1u << pq_bits, raft::WarpSize);
 
   /*
    Shared memory / L1 cache balance is the main limiter of this kernel.
diff --git a/cpp/include/cuvs/neighbors/detail/ivf_pq_dummy_block_sort.cuh b/cpp/include/cuvs/neighbors/detail/ivf_pq_dummy_block_sort.cuh
index 34273af26..8732aed3e 100644
--- a/cpp/include/cuvs/neighbors/detail/ivf_pq_dummy_block_sort.cuh
+++ b/cpp/include/cuvs/neighbors/detail/ivf_pq_dummy_block_sort.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/matrix/detail/select_warpsort.cuh>  // matrix::detail::select::warpsort::warp_sort_distributed
+#include <raft/matrix/detail/select_warpsort.cuh>  // raft::matrix::detail::select::warpsort::warp_sort_distributed
 
 /*
  * This header file is a bit of an ugly duckling. The type dummy_block_sort is
@@ -31,7 +31,8 @@ namespace cuvs::neighbors::ivf_pq::detail {
 
 template <typename T, typename IdxT>
 struct dummy_block_sort_t {
-  using queue_t = matrix::detail::select::warpsort::warp_sort_distributed<WarpSize, true, T, IdxT>;
+  using queue_t =
+    raft::matrix::detail::select::warpsort::warp_sort_distributed<raft::WarpSize, true, T, IdxT>;
   template <typename... Args>
   __device__ dummy_block_sort_t(int k, Args...){};
 };
diff --git a/cpp/include/cuvs/neighbors/detail/ivf_pq_search.cuh b/cpp/include/cuvs/neighbors/detail/ivf_pq_search.cuh
index dbaf36adf..fa6f64c7b 100644
--- a/cpp/include/cuvs/neighbors/detail/ivf_pq_search.cuh
+++ b/cpp/include/cuvs/neighbors/detail/ivf_pq_search.cuh
@@ -153,16 +153,16 @@ void select_clusters(raft::resources const& handle,
 
   // Select neighbor clusters for each query.
   rmm::device_uvector<float> cluster_dists(n_queries * n_probes, stream, mr);
-  matrix::detail::select_k<float, uint32_t>(handle,
-                                            qc_distances.data(),
-                                            nullptr,
-                                            n_queries,
-                                            n_lists,
-                                            n_probes,
-                                            cluster_dists.data(),
-                                            clusters_to_probe,
-                                            true,
-                                            mr);
+  raft::matrix::detail::select_k<float, uint32_t>(handle,
+                                                  qc_distances.data(),
+                                                  nullptr,
+                                                  n_queries,
+                                                  n_lists,
+                                                  n_probes,
+                                                  cluster_dists.data(),
+                                                  clusters_to_probe,
+                                                  true,
+                                                  mr);
 }
 
 /**
@@ -187,7 +187,7 @@ __launch_bounds__(BlockDim) RAFT_KERNEL
   chunk_indices += n_probes * blockIdx.x;
 
   // block scan
-  const uint32_t n_probes_aligned = Pow2<BlockDim>::roundUp(n_probes);
+  const uint32_t n_probes_aligned = raft::Pow2<BlockDim>::roundUp(n_probes);
   uint32_t total                  = 0;
   for (uint32_t probe_ix = threadIdx.x; probe_ix < n_probes_aligned; probe_ix += BlockDim) {
     auto label = probe_ix < n_probes ? clusters_to_probe[probe_ix] : 0u;
@@ -230,7 +230,7 @@ struct calc_chunk_indices {
   template <int BlockDim>
   static auto try_block_dim(uint32_t n_probes, uint32_t n_queries) -> configured
   {
-    if constexpr (BlockDim >= WarpSize * 2) {
+    if constexpr (BlockDim >= raft::WarpSize * 2) {
       if (BlockDim >= n_probes * 2) { return try_block_dim<(BlockDim / 2)>(n_probes, n_queries); }
     }
     return {reinterpret_cast<void*>(calc_chunk_indices_kernel<BlockDim>),
@@ -584,16 +584,16 @@ void ivfpq_search_worker(raft::resources const& handle,
 
   // Select topk vectors for each query
   rmm::device_uvector<ScoreT> topk_dists(n_queries * topK, stream, mr);
-  matrix::detail::select_k<ScoreT, uint32_t>(handle,
-                                             distances_buf.data(),
-                                             neighbors_ptr,
-                                             n_queries,
-                                             topk_len,
-                                             topK,
-                                             topk_dists.data(),
-                                             neighbors_uint32,
-                                             true,
-                                             mr);
+  raft::matrix::detail::select_k<ScoreT, uint32_t>(handle,
+                                                   distances_buf.data(),
+                                                   neighbors_ptr,
+                                                   n_queries,
+                                                   topk_len,
+                                                   topK,
+                                                   topk_dists.data(),
+                                                   neighbors_uint32,
+                                                   true,
+                                                   mr);
 
   // Postprocessing
   postprocess_distances(
@@ -741,7 +741,7 @@ inline void search(raft::resources const& handle,
 {
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
                 "Unsupported element type.");
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
     "ivf_pq::search(n_queries = %u, n_probes = %u, k = %u, dim = %zu)",
     n_queries,
     params.n_probes,
@@ -778,7 +778,7 @@ inline void search(raft::resources const& handle,
 
   uint32_t max_samples = 0;
   {
-    IdxT ms = Pow2<128>::roundUp(index.accum_sorted_sizes()(n_probes));
+    IdxT ms = raft::Pow2<128>::roundUp(index.accum_sorted_sizes()(n_probes));
     RAFT_EXPECTS(ms <= IdxT(std::numeric_limits<uint32_t>::max()),
                  "The maximum sample size is too big.");
     max_samples = ms;
diff --git a/cpp/include/cuvs/neighbors/detail/knn_merge_parts.cuh b/cpp/include/cuvs/neighbors/detail/knn_merge_parts.cuh
index 555f7b258..00610c45e 100644
--- a/cpp/include/cuvs/neighbors/detail/knn_merge_parts.cuh
+++ b/cpp/include/cuvs/neighbors/detail/knn_merge_parts.cuh
@@ -41,7 +41,7 @@ RAFT_KERNEL knn_merge_parts_kernel(const value_t* inK,
                                    int k,
                                    value_idx* translations)
 {
-  constexpr int kNumWarps = tpb / WarpSize;
+  constexpr int kNumWarps = tpb / raft::WarpSize;
 
   __shared__ value_t smemK[kNumWarps * warp_q];
   __shared__ value_idx smemV[kNumWarps * warp_q];
diff --git a/cpp/include/cuvs/neighbors/detail/refine_device.cuh b/cpp/include/cuvs/neighbors/detail/refine_device.cuh
index 61d7a6ecc..5bc478702 100644
--- a/cpp/include/cuvs/neighbors/detail/refine_device.cuh
+++ b/cpp/include/cuvs/neighbors/detail/refine_device.cuh
@@ -55,7 +55,7 @@ void refine_device(
                "k must be lest than topk::kMaxCapacity (%d).",
                raft::matrix::detail::select::warpsort::kMaxCapacity);
 
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
     "neighbors::refine(%zu, %u)", size_t(n_queries), uint32_t(n_candidates));
 
   refine_check_input(dataset.extents(),
@@ -74,7 +74,7 @@ void refine_device(
   // - We run IVF flat search with n_probes=1 to select the best k elements of the candidates.
   rmm::device_uvector<uint32_t> fake_coarse_idx(n_queries, resource::get_cuda_stream(handle));
 
-  thrust::sequence(resource::get_thrust_policy(handle),
+  thrust::sequence(raft::resource::get_thrust_policy(handle),
                    fake_coarse_idx.data(),
                    fake_coarse_idx.data() + n_queries);
 
diff --git a/cpp/include/cuvs/neighbors/detail/refine_host-inl.hpp b/cpp/include/cuvs/neighbors/detail/refine_host-inl.hpp
index ec830e58c..c753e56f7 100644
--- a/cpp/include/cuvs/neighbors/detail/refine_host-inl.hpp
+++ b/cpp/include/cuvs/neighbors/detail/refine_host-inl.hpp
@@ -39,7 +39,7 @@ template <typename DC, typename IdxT, typename DataT, typename DistanceT, typena
   size_t orig_k    = neighbor_candidates.extent(1);
   size_t refined_k = indices.extent(1);
 
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
     "neighbors::refine_host(%zu, %zu -> %zu)", n_queries, orig_k, refined_k);
 
   auto suggested_n_threads = std::max(1, std::min(omp_get_num_procs(), omp_get_max_threads()));
diff --git a/cpp/include/cuvs/neighbors/detail/selection_faiss-inl.cuh b/cpp/include/cuvs/neighbors/detail/selection_faiss-inl.cuh
index acc931e4a..f10339485 100644
--- a/cpp/include/cuvs/neighbors/detail/selection_faiss-inl.cuh
+++ b/cpp/include/cuvs/neighbors/detail/selection_faiss-inl.cuh
@@ -35,7 +35,7 @@ RAFT_KERNEL select_k_kernel(const key_t* inK,
                             payload_t initV,
                             int k)
 {
-  using align_warp        = Pow2<WarpSize>;
+  using align_warp        = raft::Pow2<raft::WarpSize>;
   constexpr int kNumWarps = align_warp::div(tpb);
 
   __shared__ key_t smemK[kNumWarps * warp_q];
@@ -93,7 +93,7 @@ inline void select_k_impl(const key_t* inK,
   constexpr int n_threads = (warp_q <= 1024) ? 128 : 64;
   auto block              = dim3(n_threads);
 
-  auto kInit = select_min ? upper_bound<key_t>() : lower_bound<key_t>();
+  auto kInit = select_min ? raft::upper_bound<key_t>() : raft::lower_bound<key_t>();
   auto vInit = -1;
   if (select_min) {
     select_k_kernel<payload_t, key_t, false, warp_q, thread_q, n_threads>
diff --git a/cpp/include/cuvs/neighbors/ivf_flat_helpers.cuh b/cpp/include/cuvs/neighbors/ivf_flat_helpers.cuh
index 25b84ec55..cca83cea0 100644
--- a/cpp/include/cuvs/neighbors/ivf_flat_helpers.cuh
+++ b/cpp/include/cuvs/neighbors/ivf_flat_helpers.cuh
@@ -64,7 +64,8 @@ void pack(
   raft::device_matrix_view<const T, uint32_t, raft::row_major> codes,
   uint32_t veclen,
   uint32_t offset,
-  device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, raft::row_major> list_data)
+  raft::device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, raft::row_major>
+    list_data)
 {
   cuvs::neighbors::ivf_flat::detail::pack_list_data<T, IdxT>(res, codes, veclen, offset, list_data);
 }
@@ -100,7 +101,7 @@ void pack(
 template <typename T, typename IdxT>
 void unpack(
   raft::resources const& res,
-  device_mdspan<const T, typename list_spec<uint32_t, T, IdxT>::list_extents, raft::row_major>
+  raft::device_mdspan<const T, typename list_spec<uint32_t, T, IdxT>::list_extents, raft::row_major>
     list_data,
   uint32_t veclen,
   uint32_t offset,
diff --git a/cpp/include/cuvs/neighbors/ivf_flat_types.hpp b/cpp/include/cuvs/neighbors/ivf_flat_types.hpp
index 4863805fa..28023f474 100644
--- a/cpp/include/cuvs/neighbors/ivf_flat_types.hpp
+++ b/cpp/include/cuvs/neighbors/ivf_flat_types.hpp
@@ -323,7 +323,7 @@ struct index : ann::index {
       copy(&this_inds_ptrs(label), &inds_ptr, 1, stream);
     }
     auto this_list_sizes = list_sizes().data_handle();
-    total_size_          = thrust::reduce(resource::get_thrust_policy(res),
+    total_size_          = thrust::reduce(raft::resource::get_thrust_policy(res),
                                  this_list_sizes,
                                  this_list_sizes + this_lists.size(),
                                  0,
diff --git a/cpp/include/cuvs/neighbors/ivf_list.hpp b/cpp/include/cuvs/neighbors/ivf_list.hpp
index 316403893..c395980de 100644
--- a/cpp/include/cuvs/neighbors/ivf_list.hpp
+++ b/cpp/include/cuvs/neighbors/ivf_list.hpp
@@ -45,7 +45,7 @@ list<SpecT, SizeT, SpecExtraArgs...>::list(raft::resources const& res,
                                            size_type n_rows)
   : size{n_rows}, data{res}, indices{res}
 {
-  auto capacity = round_up_safe<SizeT>(n_rows, spec.align_max);
+  auto capacity = raft::round_up_safe<SizeT>(n_rows, spec.align_max);
   if (n_rows < spec.align_max) {
     capacity = bound_by_power_of_two<SizeT>(std::max<SizeT>(n_rows, spec.align_min));
     capacity = std::min<SizeT>(capacity, spec.align_max);
@@ -63,7 +63,7 @@ list<SpecT, SizeT, SpecExtraArgs...>::list(raft::resources const& res,
       e.what());
   }
   // Fill the index buffer with a pre-defined marker for easier debugging
-  thrust::fill_n(resource::get_thrust_policy(res),
+  thrust::fill_n(raft::resource::get_thrust_policy(res),
                  indices.data_handle(),
                  indices.size(),
                  ivf::kInvalidRecord<index_type>);
diff --git a/cpp/include/cuvs/neighbors/ivf_pq_helpers.cuh b/cpp/include/cuvs/neighbors/ivf_pq_helpers.cuh
index 29c9f164b..f02148101 100644
--- a/cpp/include/cuvs/neighbors/ivf_pq_helpers.cuh
+++ b/cpp/include/cuvs/neighbors/ivf_pq_helpers.cuh
@@ -66,7 +66,7 @@ namespace codepacker {
  */
 inline void unpack(
   raft::resources const& res,
-  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+  raft::device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
     list_data,
   uint32_t pq_bits,
   uint32_t offset,
@@ -112,7 +112,7 @@ inline void unpack(
  */
 inline void unpack_contiguous(
   raft::resources const& res,
-  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+  raft::device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
     list_data,
   uint32_t pq_bits,
   uint32_t offset,
@@ -151,7 +151,8 @@ inline void pack(
   raft::device_matrix_view<const uint8_t, uint32_t, raft::row_major> codes,
   uint32_t pq_bits,
   uint32_t offset,
-  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major> list_data)
+  raft::device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+    list_data)
 {
   ivf_pq::detail::pack_list_data(list_data, codes, offset, pq_bits, resource::get_cuda_stream(res));
 }
@@ -191,7 +192,8 @@ inline void pack_contiguous(
   uint32_t pq_dim,
   uint32_t pq_bits,
   uint32_t offset,
-  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major> list_data)
+  raft::device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, raft::row_major>
+    list_data)
 {
   ivf_pq::detail::pack_contiguous_list_data(
     list_data, codes, n_rows, pq_dim, offset, pq_bits, resource::get_cuda_stream(res));
diff --git a/cpp/include/cuvs/neighbors/ivf_pq_types.hpp b/cpp/include/cuvs/neighbors/ivf_pq_types.hpp
index e89e11712..16a904fcc 100644
--- a/cpp/include/cuvs/neighbors/ivf_pq_types.hpp
+++ b/cpp/include/cuvs/neighbors/ivf_pq_types.hpp
@@ -175,7 +175,7 @@ struct list_spec {
    *    ].
    */
   using list_extents =
-    extents<SizeT, dynamic_extent, dynamic_extent, kIndexGroupSize, kIndexGroupVecLen>;
+    extents<SizeT, raft::dynamic_extent, raft::dynamic_extent, kIndexGroupSize, kIndexGroupVecLen>;
 
   SizeT align_max;
   SizeT align_min;
@@ -374,20 +374,21 @@ struct index : ann::index {
   {
   }
 
-  using pq_centers_extents =
-    std::experimental::extents<uint32_t, dynamic_extent, dynamic_extent, dynamic_extent>;
+  using pq_centers_extents = std::experimental::
+    extents<uint32_t, raft::dynamic_extent, raft::dynamic_extent, raft::dynamic_extent>;
   /**
    * PQ cluster centers
    *
    *   - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size]
    *   - codebook_gen::PER_CLUSTER:  [n_lists, pq_len, pq_book_size]
    */
-  inline auto pq_centers() noexcept -> device_mdspan<float, pq_centers_extents, raft::row_major>
+  inline auto pq_centers() noexcept
+    -> raft::device_mdspan<float, pq_centers_extents, raft::row_major>
   {
     return pq_centers_.view();
   }
   [[nodiscard]] inline auto pq_centers() const noexcept
-    -> device_mdspan<const float, pq_centers_extents, raft::row_major>
+    -> raft::device_mdspan<const float, pq_centers_extents, raft::row_major>
   {
     return pq_centers_.view();
   }
@@ -445,12 +446,13 @@ struct index : ann::index {
    *
    * This span is used during search to estimate the maximum size of the workspace.
    */
-  inline auto accum_sorted_sizes() noexcept -> host_vector_view<IdxT, uint32_t, raft::row_major>
+  inline auto accum_sorted_sizes() noexcept
+    -> raft::host_vector_view<IdxT, uint32_t, raft::row_major>
   {
     return accum_sorted_sizes_.view();
   }
   [[nodiscard]] inline auto accum_sorted_sizes() const noexcept
-    -> host_vector_view<const IdxT, uint32_t, raft::row_major>
+    -> raft::host_vector_view<const IdxT, uint32_t, raft::row_major>
   {
     return accum_sorted_sizes_.view();
   }
@@ -523,7 +525,7 @@ struct index : ann::index {
   // Primary data members
   std::vector<std::shared_ptr<list_data<IdxT>>> lists_;
   raft::device_vector<uint32_t, uint32_t, raft::row_major> list_sizes_;
-  device_mdarray<float, pq_centers_extents, raft::row_major> pq_centers_;
+  raft::device_mdarray<float, pq_centers_extents, raft::row_major> pq_centers_;
   raft::device_matrix<float, uint32_t, raft::row_major> centers_;
   raft::device_matrix<float, uint32_t, raft::row_major> centers_rot_;
   raft::device_matrix<float, uint32_t, raft::row_major> rotation_matrix_;
@@ -531,7 +533,7 @@ struct index : ann::index {
   // Computed members for accelerating search.
   raft::device_vector<uint8_t*, uint32_t, raft::row_major> data_ptrs_;
   raft::device_vector<IdxT*, uint32_t, raft::row_major> inds_ptrs_;
-  host_vector<IdxT, uint32_t, raft::row_major> accum_sorted_sizes_;
+  raft::host_vector<IdxT, uint32_t, raft::row_major> accum_sorted_sizes_;
 
   /** Throw an error if the index content is inconsistent. */
   void check_consistency()
diff --git a/cpp/include/cuvs/spatial/knn/ann.cuh b/cpp/include/cuvs/spatial/knn/ann.cuh
index e19eba6a4..99f5f12eb 100644
--- a/cpp/include/cuvs/spatial/knn/ann.cuh
+++ b/cpp/include/cuvs/spatial/knn/ann.cuh
@@ -47,7 +47,7 @@ approx_knn_build_index(raft::resources& handle,
                        value_idx n,
                        value_idx D)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
     "legacy approx_knn_build_index(n_rows = %u, dim = %u)", n, D);
   detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D);
 }
@@ -75,7 +75,7 @@ approx_knn_search(raft::resources& handle,
                   T* query_array,
                   value_idx n)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
     "legacy approx_knn_search(k = %u, n_queries = %u)", k, n);
   detail::approx_knn_search(handle, distances, indices, index, k, query_array, n);
 }
diff --git a/cpp/include/cuvs/spatial/knn/detail/ball_cover.cuh b/cpp/include/cuvs/spatial/knn/detail/ball_cover.cuh
index 1291b160c..f467600dd 100644
--- a/cpp/include/cuvs/spatial/knn/detail/ball_cover.cuh
+++ b/cpp/include/cuvs/spatial/knn/detail/ball_cover.cuh
@@ -72,17 +72,17 @@ void sample_landmarks(raft::resources const& handle,
   rmm::device_uvector<value_t> R_1nn_ones(index.m, resource::get_cuda_stream(handle));
   rmm::device_uvector<value_idx> R_indices(index.n_landmarks, resource::get_cuda_stream(handle));
 
-  thrust::sequence(resource::get_thrust_policy(handle),
+  thrust::sequence(raft::resource::get_thrust_policy(handle),
                    index.get_R_1nn_cols().data_handle(),
                    index.get_R_1nn_cols().data_handle() + index.m,
                    (value_idx)0);
 
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                R_1nn_ones.data(),
                R_1nn_ones.data() + R_1nn_ones.size(),
                1.0);
 
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                R_indices.data(),
                R_indices.data() + R_indices.size(),
                0.0);
@@ -131,7 +131,7 @@ void construct_landmark_1nn(raft::resources const& handle,
 {
   rmm::device_uvector<value_idx> R_1nn_inds(index.m, resource::get_cuda_stream(handle));
 
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                R_1nn_inds.data(),
                R_1nn_inds.data() + index.m,
                std::numeric_limits<value_idx>::max());
@@ -150,7 +150,7 @@ void construct_landmark_1nn(raft::resources const& handle,
     thrust::make_tuple(R_1nn_inds.data(), index.get_R_1nn_dists().data_handle()));
 
   // group neighborhoods for each reference landmark and sort each group by distance
-  thrust::sort_by_key(resource::get_thrust_policy(handle),
+  thrust::sort_by_key(raft::resource::get_thrust_policy(handle),
                       keys,
                       keys + index.m,
                       index.get_R_1nn_cols().data_handle(),
@@ -214,7 +214,7 @@ void compute_landmark_radii(raft::resources const& handle,
   const value_idx* R_indptr_ptr  = index.get_R_indptr().data_handle();
   const value_t* R_1nn_dists_ptr = index.get_R_1nn_dists().data_handle();
   value_t* R_radius_ptr          = index.get_R_radius().data_handle();
-  thrust::for_each(resource::get_thrust_policy(handle),
+  thrust::for_each(raft::resource::get_thrust_policy(handle),
                    entries,
                    entries + index.n_landmarks,
                    [=] __device__(value_idx input) {
@@ -253,11 +253,11 @@ void perform_rbc_query(raft::resources const& handle,
                        bool perform_post_filtering = true)
 {
   // initialize output inds and dists
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                inds,
                inds + (k * n_query_pts),
                std::numeric_limits<value_idx>::max());
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                dists,
                dists + (k * n_query_pts),
                std::numeric_limits<value_t>::max());
@@ -349,11 +349,11 @@ void rbc_build_index(raft::resources const& handle,
   rmm::device_uvector<value_idx> R_knn_inds(index.m, resource::get_cuda_stream(handle));
 
   // Initialize the uvectors
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                R_knn_inds.begin(),
                R_knn_inds.end(),
                std::numeric_limits<value_idx>::max());
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                index.get_R_closest_landmark_dists().data_handle(),
                index.get_R_closest_landmark_dists().data_handle() + index.m,
                std::numeric_limits<value_t>::max());
@@ -416,20 +416,20 @@ void rbc_all_knn_query(raft::resources const& handle,
   rmm::device_uvector<value_t> R_knn_dists(k * index.m, raft::resource::get_cuda_stream(handle));
 
   // Initialize the uvectors
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                R_knn_inds.begin(),
                R_knn_inds.end(),
                std::numeric_limits<value_idx>::max());
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                R_knn_dists.begin(),
                R_knn_dists.end(),
                std::numeric_limits<value_t>::max());
 
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                inds,
                inds + (k * index.m),
                std::numeric_limits<value_idx>::max());
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                dists,
                dists + (k * index.m),
                std::numeric_limits<value_t>::max());
@@ -494,20 +494,20 @@ void rbc_knn_query(raft::resources const& handle,
                                            raft::resource::get_cuda_stream(handle));
 
   // Initialize the uvectors
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                R_knn_inds.begin(),
                R_knn_inds.end(),
                std::numeric_limits<value_idx>::max());
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                R_knn_dists.begin(),
                R_knn_dists.end(),
                std::numeric_limits<value_t>::max());
 
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                inds,
                inds + (k * n_query_pts),
                std::numeric_limits<value_idx>::max());
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                dists,
                dists + (k * n_query_pts),
                std::numeric_limits<value_t>::max());
@@ -518,11 +518,11 @@ void rbc_knn_query(raft::resources const& handle,
   rmm::device_uvector<value_int> dists_counter(index.m, raft::resource::get_cuda_stream(handle));
   rmm::device_uvector<value_int> post_dists_counter(index.m,
                                                     raft::resource::get_cuda_stream(handle));
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                post_dists_counter.data(),
                post_dists_counter.data() + post_dists_counter.size(),
                0);
-  thrust::fill(resource::get_thrust_policy(handle),
+  thrust::fill(raft::resource::get_thrust_policy(handle),
                dists_counter.data(),
                dists_counter.data() + dists_counter.size(),
                0);
diff --git a/cpp/include/cuvs/spatial/knn/detail/ball_cover/registers-inl.cuh b/cpp/include/cuvs/spatial/knn/detail/ball_cover/registers-inl.cuh
index 95a70ba30..f1cb45e97 100644
--- a/cpp/include/cuvs/spatial/knn/detail/ball_cover/registers-inl.cuh
+++ b/cpp/include/cuvs/spatial/knn/detail/ball_cover/registers-inl.cuh
@@ -171,7 +171,7 @@ RAFT_KERNEL compute_final_dists_registers(const value_t* X_index,
                                           dist_func dfunc,
                                           value_int* dist_counter)
 {
-  static constexpr int kNumWarps = tpb / WarpSize;
+  static constexpr int kNumWarps = tpb / raft::WarpSize;
 
   __shared__ value_t shared_memK[kNumWarps * warp_q];
   __shared__ KeyValuePair<value_t, value_idx> shared_memV[kNumWarps * warp_q];
@@ -191,7 +191,7 @@ RAFT_KERNEL compute_final_dists_registers(const value_t* X_index,
     shared_memV,
     k);
 
-  const value_int n_k = Pow2<WarpSize>::roundDown(k);
+  const value_int n_k = raft::Pow2<raft::WarpSize>::roundDown(k);
   value_int i         = threadIdx.x;
   for (; i < n_k; i += tpb) {
     value_idx ind = knn_inds[blockIdx.x * k + i];
@@ -218,7 +218,7 @@ RAFT_KERNEL compute_final_dists_registers(const value_t* X_index,
       // Round R_size to the nearest warp threads so they can
       // all be computing in parallel.
 
-      const value_int limit = Pow2<WarpSize>::roundDown(R_size);
+      const value_int limit = raft::Pow2<raft::WarpSize>::roundDown(R_size);
 
       i = threadIdx.x;
       for (; i < limit; i += tpb) {
@@ -328,7 +328,7 @@ RAFT_KERNEL block_rbc_kernel_registers(const value_t* X_index,
                                        distance_func dfunc,
                                        float weight = 1.0)
 {
-  static constexpr value_int kNumWarps = tpb / WarpSize;
+  static constexpr value_int kNumWarps = tpb / raft::WarpSize;
 
   __shared__ value_t shared_memK[kNumWarps * warp_q];
   __shared__ KeyValuePair<value_t, value_idx> shared_memV[kNumWarps * warp_q];
@@ -379,7 +379,7 @@ RAFT_KERNEL block_rbc_kernel_registers(const value_t* X_index,
 
     value_idx R_size = R_stop_offset - R_start_offset;
 
-    value_int limit = Pow2<WarpSize>::roundDown(R_size);
+    value_int limit = raft::Pow2<raft::WarpSize>::roundDown(R_size);
     value_int i     = threadIdx.x;
     for (; i < limit; i += tpb) {
       // Index and distance of current candidate's nearest landmark
diff --git a/cpp/include/cuvs/spatial/knn/knn.cuh b/cpp/include/cuvs/spatial/knn/knn.cuh
index f73268d5b..f6267feb5 100644
--- a/cpp/include/cuvs/spatial/knn/knn.cuh
+++ b/cpp/include/cuvs/spatial/knn/knn.cuh
@@ -134,12 +134,13 @@ template <typename idx_t = int, typename value_t = float>
   cudaStream_t stream,
   SelectKAlgo algo = SelectKAlgo::FAISS)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("select-%s-%d (%zu, %zu) algo-%d",
-                                                            select_min ? "min" : "max",
-                                                            k,
-                                                            n_inputs,
-                                                            input_len,
-                                                            int(algo));
+  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
+    "select-%s-%d (%zu, %zu) algo-%d",
+    select_min ? "min" : "max",
+    k,
+    n_inputs,
+    input_len,
+    int(algo));
   ASSERT(size_t(input_len) >= size_t(k),
          "Size of the input (input_len = %zu) must be not smaller than the selection (k = %zu).",
          size_t(input_len),
@@ -152,17 +153,17 @@ template <typename idx_t = int, typename value_t = float>
       break;
 
     case SelectKAlgo::RADIX_8_BITS:
-      matrix::detail::select::radix::select_k<value_t, idx_t, 8, 512>(
+      raft::matrix::detail::select::radix::select_k<value_t, idx_t, 8, 512>(
         in_keys, in_values, n_inputs, input_len, k, out_keys, out_values, select_min, true, stream);
       break;
 
     case SelectKAlgo::RADIX_11_BITS:
-      matrix::detail::select::radix::select_k<value_t, idx_t, 11, 512>(
+      raft::matrix::detail::select::radix::select_k<value_t, idx_t, 11, 512>(
         in_keys, in_values, n_inputs, input_len, k, out_keys, out_values, select_min, true, stream);
       break;
 
     case SelectKAlgo::WARP_SORT:
-      matrix::detail::select::warpsort::select_k<value_t, idx_t>(
+      raft::matrix::detail::select::warpsort::select_k<value_t, idx_t>(
         in_keys, in_values, n_inputs, input_len, k, out_keys, out_values, select_min, stream);
       break;
 
diff --git a/cpp/include/cuvs/spectral/cluster_solvers.cuh b/cpp/include/cuvs/spectral/cluster_solvers.cuh
index 760f55491..63859adb1 100644
--- a/cpp/include/cuvs/spectral/cluster_solvers.cuh
+++ b/cpp/include/cuvs/spectral/cluster_solvers.cuh
@@ -69,7 +69,7 @@ struct kmeans_solver_t {
     auto centroids =
       raft::make_device_matrix<value_type_t, index_type_t>(handle, config_.n_clusters, dim);
     auto weight = raft::make_device_vector<value_type_t, index_type_t>(handle, n_obs_vecs);
-    thrust::fill(resource::get_thrust_policy(handle),
+    thrust::fill(raft::resource::get_thrust_policy(handle),
                  weight.data_handle(),
                  weight.data_handle() + n_obs_vecs,
                  1);
diff --git a/cpp/include/cuvs/spectral/detail/matrix_wrappers.hpp b/cpp/include/cuvs/spectral/detail/matrix_wrappers.hpp
index 3cd3eec0b..ebdb9835a 100644
--- a/cpp/include/cuvs/spectral/detail/matrix_wrappers.hpp
+++ b/cpp/include/cuvs/spectral/detail/matrix_wrappers.hpp
@@ -93,9 +93,9 @@ struct vector_view_t {
 template <typename value_type>
 class vector_t {
  public:
-  vector_t(resources const& raft_handle, size_type sz)
+  vector_t(raft::resources const& raft_handle, size_type sz)
     : buffer_(sz, resource::get_cuda_stream(raft_handle)),
-      thrust_policy(resource::get_thrust_policy(raft_handle))
+      thrust_policy(raft::resource::get_thrust_policy(raft_handle))
   {
   }
 
@@ -133,7 +133,7 @@ class vector_t {
 
 template <typename index_type, typename value_type>
 struct sparse_matrix_t {
-  sparse_matrix_t(resources const& raft_handle,
+  sparse_matrix_t(raft::resources const& raft_handle,
                   index_type const* row_offsets,
                   index_type const* col_indices,
                   value_type const* values,
@@ -150,7 +150,7 @@ struct sparse_matrix_t {
   {
   }
 
-  sparse_matrix_t(resources const& raft_handle,
+  sparse_matrix_t(raft::resources const& raft_handle,
                   index_type const* row_offsets,
                   index_type const* col_indices,
                   value_type const* values,
@@ -167,7 +167,7 @@ struct sparse_matrix_t {
   }
 
   template <typename CSRView>
-  sparse_matrix_t(resources const& raft_handle, CSRView const& csr_view)
+  sparse_matrix_t(raft::resources const& raft_handle, CSRView const& csr_view)
     : handle_(raft_handle),
       row_offsets_(csr_view.offsets),
       col_indices_(csr_view.indices),
@@ -313,7 +313,7 @@ struct sparse_matrix_t {
 
 template <typename index_type, typename value_type>
 struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
-  laplacian_matrix_t(resources const& raft_handle,
+  laplacian_matrix_t(raft::resources const& raft_handle,
                      index_type const* row_offsets,
                      index_type const* col_indices,
                      value_type const* values,
@@ -328,7 +328,7 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
     sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
   }
 
-  laplacian_matrix_t(resources const& raft_handle,
+  laplacian_matrix_t(raft::resources const& raft_handle,
                      sparse_matrix_t<index_type, value_type> const& csr_m)
     : sparse_matrix_t<index_type, value_type>(raft_handle,
                                               csr_m.row_offsets_,
@@ -387,7 +387,7 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
 
 template <typename index_type, typename value_type>
 struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type> {
-  modularity_matrix_t(resources const& raft_handle,
+  modularity_matrix_t(raft::resources const& raft_handle,
                       index_type const* row_offsets,
                       index_type const* col_indices,
                       value_type const* values,
@@ -399,7 +399,7 @@ struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type> {
     edge_sum_ = laplacian_matrix_t<index_type, value_type>::diagonal_.nrm1();
   }
 
-  modularity_matrix_t(resources const& raft_handle,
+  modularity_matrix_t(raft::resources const& raft_handle,
                       sparse_matrix_t<index_type, value_type> const& csr_m)
     : laplacian_matrix_t<index_type, value_type>(raft_handle, csr_m)
   {
diff --git a/cpp/include/cuvs/spectral/eigen_solvers.cuh b/cpp/include/cuvs/spectral/eigen_solvers.cuh
index be8be83cd..59e0c0d96 100644
--- a/cpp/include/cuvs/spectral/eigen_solvers.cuh
+++ b/cpp/include/cuvs/spectral/eigen_solvers.cuh
@@ -51,7 +51,7 @@ struct lanczos_solver_t {
 
   index_type_t solve_smallest_eigenvectors(
     raft::resources const& handle,
-    matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
+    raft::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
     value_type_t* __restrict__ eigVals,
     value_type_t* __restrict__ eigVecs) const
   {
@@ -74,7 +74,7 @@ struct lanczos_solver_t {
 
   index_type_t solve_largest_eigenvectors(
     raft::resources const& handle,
-    matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
+    raft::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
     value_type_t* __restrict__ eigVals,
     value_type_t* __restrict__ eigVecs) const
   {
diff --git a/cpp/include/cuvs/spectral/modularity_maximization.cuh b/cpp/include/cuvs/spectral/modularity_maximization.cuh
index 2a15b8fe1..6cee2086d 100644
--- a/cpp/include/cuvs/spectral/modularity_maximization.cuh
+++ b/cpp/include/cuvs/spectral/modularity_maximization.cuh
@@ -46,7 +46,7 @@ namespace spectral {
 template <typename vertex_t, typename weight_t, typename EigenSolver, typename ClusterSolver>
 std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
   raft::resources const& handle,
-  matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+  raft::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
   EigenSolver const& eigen_solver,
   ClusterSolver const& cluster_solver,
   vertex_t* __restrict__ clusters,
@@ -71,7 +71,7 @@ std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
  */
 template <typename vertex_t, typename weight_t>
 void analyzeModularity(raft::resources const& handle,
-                       matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+                       raft::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
                        vertex_t nClusters,
                        vertex_t const* __restrict__ clusters,
                        weight_t& modularity)
diff --git a/cpp/include/cuvs/spectral/partition.cuh b/cpp/include/cuvs/spectral/partition.cuh
index c16dd4ef0..3f327dbfb 100644
--- a/cpp/include/cuvs/spectral/partition.cuh
+++ b/cpp/include/cuvs/spectral/partition.cuh
@@ -48,7 +48,7 @@ namespace spectral {
 template <typename vertex_t, typename weight_t, typename EigenSolver, typename ClusterSolver>
 std::tuple<vertex_t, weight_t, vertex_t> partition(
   raft::resources const& handle,
-  matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+  raft::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
   EigenSolver const& eigen_solver,
   ClusterSolver const& cluster_solver,
   vertex_t* __restrict__ clusters,
@@ -79,7 +79,7 @@ std::tuple<vertex_t, weight_t, vertex_t> partition(
  */
 template <typename vertex_t, typename weight_t>
 void analyzePartition(raft::resources const& handle,
-                      matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+                      raft::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
                       vertex_t nClusters,
                       const vertex_t* __restrict__ clusters,
                       weight_t& edgeCut,
diff --git a/cpp/include/cuvs/stats/detail/meanvar.cuh b/cpp/include/cuvs/stats/detail/meanvar.cuh
index 1ebaf3b18..c286d5ed9 100644
--- a/cpp/include/cuvs/stats/detail/meanvar.cuh
+++ b/cpp/include/cuvs/stats/detail/meanvar.cuh
@@ -100,7 +100,7 @@ NB: current implementation here is not optimal, especially the rowmajor version;
  *
  * Assumptions:
  *
- *  1. blockDim.x == WarpSize
+ *  1. blockDim.x == raft::WarpSize
  *  2. Dimension X goes along columns (D)
  *  3. Dimension Y goes along rows (N)
  *
@@ -197,8 +197,9 @@ void meanvar(
   T* mean, T* var, const T* data, I D, I N, bool sample, bool rowMajor, cudaStream_t stream)
 {
   if (rowMajor) {
-    static_assert(BlockSize >= WarpSize, "Block size must be not smaller than the warp size.");
-    const dim3 bs(WarpSize, BlockSize / WarpSize, 1);
+    static_assert(BlockSize >= raft::WarpSize,
+                  "Block size must be not smaller than the warp size.");
+    const dim3 bs(raft::WarpSize, BlockSize / raft::WarpSize, 1);
     dim3 gs(raft::ceildiv<decltype(bs.x)>(D, bs.x), raft::ceildiv<decltype(bs.y)>(N, bs.y), 1);
 
     // Don't create more blocks than necessary to occupy the GPU