diff --git a/cpp/include/cugraph/shuffle_functions.hpp b/cpp/include/cugraph/shuffle_functions.hpp
index 08f85397ed0..b87e75b2e5f 100644
--- a/cpp/include/cugraph/shuffle_functions.hpp
+++ b/cpp/include/cugraph/shuffle_functions.hpp
@@ -153,4 +153,94 @@ shuffle_ext_edges(raft::handle_t const& handle,
                   std::optional<rmm::device_uvector<edge_time_t>>&& edge_end_times,
                   bool store_transposed);
 
+/**
+ * @brief Shuffle local edge sources (already placed by edge partitioning) to the owning GPUs (by
+ * vertex partitioning).
+ *
+ * @tparam vertex_t    Type of vertex identifiers. Needs to be an integral type.
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param edge_srcs  Vector of local edge source IDs (passed as an rvalue reference)
+ * @param vertex_partition_range_lasts  Span of vertex partition range lasts (size = number of GPUs)
+ * @param store_transposed Should be true if shuffled edges will be used with a cugraph::graph_t
+ * object with store_transposed = true. Should be false otherwise.
+ * @return Vector of shuffled edge source vertex IDs (shuffled by vertex partitioning).
+ */
+template <typename vertex_t>
+rmm::device_uvector<vertex_t> shuffle_local_edge_srcs(
+  raft::handle_t const& handle,
+  rmm::device_uvector<vertex_t>&& edge_srcs,
+  raft::host_span<vertex_t const> vertex_partition_range_lasts,
+  bool store_transposed);
+
+/**
+ * @brief Shuffle local edge source & value pairs (already placed by edge partitioning) to the
+ * owning GPUs (by vertex partitioning).
+ *
+ * @tparam vertex_t    Type of vertex identifiers. Needs to be an integral type.
+ * @tparam value_t     Type of the values paired with the edge sources (see @p edge_values).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param edge_srcs  Vector of local edge source IDs (passed as an rvalue reference)
+ * @param edge_values  Vector of edge values (passed as an rvalue reference)
+ * @param vertex_partition_range_lasts  Span of vertex partition range lasts (size = number of GPUs)
+ * @param store_transposed Should be true if shuffled edges will be used with a cugraph::graph_t
+ * object with store_transposed = true. Should be false otherwise.
+ * @return Tuple of vectors storing shuffled edge source vertex IDs and values (shuffled by vertex
+ * partitioning).
+ */
+template <typename vertex_t, typename value_t>
+std::tuple<rmm::device_uvector<vertex_t>, dataframe_buffer_type_t<value_t>>
+shuffle_local_edge_src_value_pairs(raft::handle_t const& handle,
+                                   rmm::device_uvector<vertex_t>&& edge_srcs,
+                                   dataframe_buffer_type_t<value_t>&& edge_values,
+                                   raft::host_span<vertex_t const> vertex_partition_range_lasts,
+                                   bool store_transposed);
+
+/**
+ * @brief Shuffle local edge destinations (already placed by edge partitioning) to the owning GPUs
+ * (by vertex partitioning).
+ *
+ * @tparam vertex_t    Type of vertex identifiers. Needs to be an integral type.
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param edge_dsts  Vector of local edge destination IDs (passed as an rvalue reference)
+ * @param vertex_partition_range_lasts  Span of vertex partition range lasts (size = number of GPUs)
+ * @param store_transposed Should be true if shuffled edges will be used with a cugraph::graph_t
+ * object with store_transposed = true. Should be false otherwise.
+ * @return Vector of shuffled edge destination vertex IDs (shuffled by vertex partitioning).
+ */
+template <typename vertex_t>
+rmm::device_uvector<vertex_t> shuffle_local_edge_dsts(
+  raft::handle_t const& handle,
+  rmm::device_uvector<vertex_t>&& edge_dsts,
+  raft::host_span<vertex_t const> vertex_partition_range_lasts,
+  bool store_transposed);
+
+/**
+ * @brief Shuffle local edge destination & value pairs (already placed by edge partitioning) to the
+ * owning GPUs (by vertex partitioning).
+ *
+ * @tparam vertex_t    Type of vertex identifiers. Needs to be an integral type.
+ * @tparam value_t     Type of the values paired with the edge destinations (see @p edge_values).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param edge_dsts  Vector of local edge destination IDs (passed as an rvalue reference)
+ * @param edge_values  Vector of edge values (passed as an rvalue reference)
+ * @param vertex_partition_range_lasts  Span of vertex partition range lasts (size = number of GPUs)
+ * @param store_transposed Should be true if shuffled edges will be used with a cugraph::graph_t
+ * object with store_transposed = true. Should be false otherwise.
+ * @return Tuple of vectors storing shuffled edge destination vertex IDs and values (shuffled by
+ * vertex partitioning).
+ */
+template <typename vertex_t, typename value_t>
+std::tuple<rmm::device_uvector<vertex_t>, dataframe_buffer_type_t<value_t>>
+shuffle_local_edge_dst_value_pairs(raft::handle_t const& handle,
+                                   rmm::device_uvector<vertex_t>&& edge_dsts,
+                                   dataframe_buffer_type_t<value_t>&& edge_values,
+                                   raft::host_span<vertex_t const> vertex_partition_range_lasts,
+                                   bool store_transposed);
+
 }  // namespace cugraph
diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh
index 60f983e5877..01c152c0423 100644
--- a/cpp/src/prims/fill_edge_src_dst_property.cuh
+++ b/cpp/src/prims/fill_edge_src_dst_property.cuh
@@ -194,7 +194,7 @@ void fill_edge_major_property(raft::handle_t const& handle,
             if ((it != edge_partition_key_last) && (*it == major)) {
               auto edge_partition_offset = thrust::distance(edge_partition_key_first, it);
               if constexpr (contains_packed_bool_element) {
-                packe_bool_atomic_set(edge_partition_value_first, edge_partition_offset, input);
+                packed_bool_atomic_set(edge_partition_value_first, edge_partition_offset, input);
               } else {
                 *(edge_partition_value_first + edge_partition_offset) = input;
               }
diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh
index 914079852b3..480ae658dcb 100644
--- a/cpp/src/traversal/bfs_impl.cuh
+++ b/cpp/src/traversal/bfs_impl.cuh
@@ -19,7 +19,6 @@
 #include "prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh"
 #include "prims/reduce_op.cuh"
 #include "prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh"
-#include "prims/update_v_frontier.cuh"
 #include "prims/vertex_frontier.cuh"
 
 #include <cugraph/algorithms.hpp>
diff --git a/cpp/tests/traversal/bfs_test.cpp b/cpp/tests/traversal/bfs_test.cpp
index b17b9aee66b..284eb82c721 100644
--- a/cpp/tests/traversal/bfs_test.cpp
+++ b/cpp/tests/traversal/bfs_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -135,16 +135,6 @@ class Tests_BFS : public ::testing::TestWithParam<std::tuple<BFS_Usecase, input_
         cugraph::test::generate<decltype(graph_view), bool>::edge_property(handle, graph_view, 2);
       graph_view.attach_edge_mask((*edge_mask).view());
     }
-    {  // FIXME: for testing, delete
-      auto num_self_loops  = graph_view.count_self_loops(handle);
-      auto number_of_edges = graph_view.compute_number_of_edges(handle);
-      std::cout << "V=" << graph_view.number_of_vertices() << " E=" << number_of_edges
-                << " num_self_loops=" << num_self_loops;
-      if (graph_view.is_symmetric()) {
-        std::cout << " undirected E=" << ((number_of_edges - num_self_loops) / 2 + num_self_loops)
-                  << std::endl;
-      }
-    }
 
     ASSERT_TRUE(static_cast<vertex_t>(bfs_usecase.source) >= 0 &&
                 static_cast<vertex_t>(bfs_usecase.source) < graph_view.number_of_vertices())
@@ -214,8 +204,7 @@ class Tests_BFS : public ::testing::TestWithParam<std::tuple<BFS_Usecase, input_
                                                d_predecessors.size(),
                                                (*d_renumber_map_labels).data(),
                                                vertex_t{0},
-                                               graph_view.number_of_vertices(),
-                                               true);
+                                               graph_view.number_of_vertices());
 
         rmm::device_uvector<vertex_t> d_unrenumbered_distances(size_t{0}, handle.get_stream());
         std::tie(std::ignore, d_unrenumbered_distances) =