diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d35c5c02218..ebffd18ca5a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,3 +1,17 @@ +# https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners +# Order matters - match of highest importance goes last (last match wins) + +#doc code owners +datasets/ @rapidsai/cugraph-doc-codeowners +notebooks/ @rapidsai/cugraph-doc-codeowners +docs/ @rapidsai/cugraph-doc-codeowners +**/*.txt @rapidsai/cugraph-doc-codeowners +**/*.md @rapidsai/cugraph-doc-codeowners +**/*.rst @rapidsai/cugraph-doc-codeowners +**/*.ipynb @rapidsai/cugraph-doc-codeowners +**/*.pdf @rapidsai/cugraph-doc-codeowners +**/*.png @rapidsai/cugraph-doc-codeowners + #cpp code owners cpp/ @rapidsai/cugraph-cpp-codeowners @@ -9,7 +23,7 @@ python/ @rapidsai/cugraph-python-codeowners **/cmake/ @rapidsai/cugraph-cmake-codeowners #build/ops code owners -.github/ @rapidsai/ops-codeowners +.github/ @rapidsai/ops-codeowners ci/ @rapidsai/ops-codeowners conda/ @rapidsai/ops-codeowners **/Dockerfile @rapidsai/ops-codeowners diff --git a/.github/labeler.yml b/.github/labeler.yml new file mode 100644 index 00000000000..9c3af6de64b --- /dev/null +++ b/.github/labeler.yml @@ -0,0 +1,37 @@ +# https://github.com/actions/labeler#common-examples +# Adapted from https://github.com/rapidsai/cugraph/blob/main/.github/CODEOWNERS +# Labels culled from https://github.com/rapidsai/cugraph/labels + +python: + - 'python/**' + - 'notebooks/**' + +benchmarks: + - 'benchmarks/**' + +doc: + - 'docs/**' + - '**/*.md' + - 'datasets/**' + - 'notebooks/**' + - '**/*.txt' + - '**/*.rst' + - '**/*.ipynb' + - '**/*.pdf' + - '**/*.png' + +datasets: + - 'datasets/**' + +cuGraph: + - 'cpp/**' + +CMake: + - '**/CMakeLists.txt' + - '**/cmake/**' + +gpuCI: + - 'ci/**' + +conda: + - 'conda/**' diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml new file mode 100644 index 00000000000..8b65da69aa2 --- /dev/null +++ b/.github/workflows/stale.yaml @@ -0,0 +1,57 @@ +name: Mark inactive issues and pull requests + +on: + schedule: + - cron: "0 * * * *" + +jobs: + mark-inactive-30d: + runs-on: ubuntu-latest + steps: + - name: Mark 30 day inactive issues and pull requests + uses: actions/stale@v3 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + stale-issue-message: > + This issue has been labeled `inactive-30d` due to no recent activity in the past 30 days. + Please close this issue if no further response or action is needed. + Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. + This issue will be labeled `inactive-90d` if there is no activity in the next 60 days. + stale-issue-label: "inactive-30d" + exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue" + days-before-issue-stale: 30 + days-before-issue-close: -1 + stale-pr-message: > + This PR has been labeled `inactive-30d` due to no recent activity in the past 30 days. + Please close this PR if it is no longer required. + Otherwise, please respond with a comment indicating any updates. + This PR will be labeled `inactive-90d` if there is no activity in the next 60 days. 
+ stale-pr-label: "inactive-30d" + exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue" + days-before-pr-stale: 30 + days-before-pr-close: -1 + operations-per-run: 50 + mark-inactive-90d: + runs-on: ubuntu-latest + steps: + - name: Mark 90 day inactive issues and pull requests + uses: actions/stale@v3 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + stale-issue-message: > + This issue has been labeled `inactive-90d` due to no recent activity in the past 90 days. + Please close this issue if no further response or action is needed. + Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. + stale-issue-label: "inactive-90d" + exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue" + days-before-issue-stale: 90 + days-before-issue-close: -1 + stale-pr-message: > + This PR has been labeled `inactive-90d` due to no recent activity in the past 90 days. + Please close this PR if it is no longer required. + Otherwise, please respond with a comment indicating any updates. + stale-pr-label: "inactive-90d" + exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue" + days-before-pr-stale: 90 + days-before-pr-close: -1 + operations-per-run: 50 diff --git a/CHANGELOG.md b/CHANGELOG.md index 815ffa86e35..601ac2fb4f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,228 @@ -# cuGraph 0.17.0 (10 Dec 2020) +# cuGraph 21.08.00 (4 Aug 2021) + +## 🚨 Breaking Changes + +- Removed deprecated code ([#1705](https://github.com/rapidsai/cugraph/pull/1705)) [@BradReesWork](https://github.com/BradReesWork) +- Delete legacy renumbering implementation ([#1681](https://github.com/rapidsai/cugraph/pull/1681)) [@ChuckHastings](https://github.com/ChuckHastings) +- Migrate old graph to legacy directory/namespace ([#1675](https://github.com/rapidsai/cugraph/pull/1675)) [@ChuckHastings](https://github.com/ChuckHastings) + +## 🐛 Bug Fixes + +- Changed cuco cmake function to return early if cuco has already been added as a target ([#1746](https://github.com/rapidsai/cugraph/pull/1746)) [@rlratzel](https://github.com/rlratzel) +- revert cuco to latest dev branch, issues should be fixed ([#1721](https://github.com/rapidsai/cugraph/pull/1721)) [@ChuckHastings](https://github.com/ChuckHastings) +- Fix `conda` uploads ([#1712](https://github.com/rapidsai/cugraph/pull/1712)) [@ajschmidt8](https://github.com/ajschmidt8) +- Updated for CUDA-specific py packages ([#1709](https://github.com/rapidsai/cugraph/pull/1709)) [@rlratzel](https://github.com/rlratzel) +- Use `library_dirs` for cython linking, link cudatoolkit libs, allow setting UCX install location ([#1698](https://github.com/rapidsai/cugraph/pull/1698)) [@trxcllnt](https://github.com/trxcllnt) +- Fix the Louvain failure with 64 bit vertex IDs ([#1696](https://github.com/rapidsai/cugraph/pull/1696)) [@seunghwak](https://github.com/seunghwak) +- Use nested include in destination of install headers to avoid docker permission issues ([#1656](https://github.com/rapidsai/cugraph/pull/1656)) [@dantegd](https://github.com/dantegd) +- Added accidentally-removed cpp-mgtests target back to the valid args list ([#1652](https://github.com/rapidsai/cugraph/pull/1652)) [@rlratzel](https://github.com/rlratzel) +- Update UCX-Py version to 0.21 ([#1650](https://github.com/rapidsai/cugraph/pull/1650)) [@pentschev](https://github.com/pentschev) + +## 📖 Documentation + +- Docs for RMAT ([#1735](https://github.com/rapidsai/cugraph/pull/1735))
[@BradReesWork](https://github.com/BradReesWork) +- Doc updates ([#1719](https://github.com/rapidsai/cugraph/pull/1719)) [@BradReesWork](https://github.com/BradReesWork) + +## 🚀 New Features + +- Fea cleanup stream part1 ([#1653](https://github.com/rapidsai/cugraph/pull/1653)) [@ChuckHastings](https://github.com/ChuckHastings) + +## 🛠️ Improvements + +- Pinning cuco to a specific commit hash for release ([#1741](https://github.com/rapidsai/cugraph/pull/1741)) [@rlratzel](https://github.com/rlratzel) +- Pin max version for `dask` & `distributed` ([#1736](https://github.com/rapidsai/cugraph/pull/1736)) [@galipremsagar](https://github.com/galipremsagar) +- Fix libfaiss dependency to not expressly depend on conda-forge ([#1728](https://github.com/rapidsai/cugraph/pull/1728)) [@Ethyling](https://github.com/Ethyling) +- Fix MG_test bug ([#1718](https://github.com/rapidsai/cugraph/pull/1718)) [@jnke2016](https://github.com/jnke2016) +- Cascaded dispatch for type-erased API ([#1711](https://github.com/rapidsai/cugraph/pull/1711)) [@aschaffer](https://github.com/aschaffer) +- ReduceV test ([#1710](https://github.com/rapidsai/cugraph/pull/1710)) [@kaatish](https://github.com/kaatish) +- Removed deprecated code ([#1705](https://github.com/rapidsai/cugraph/pull/1705)) [@BradReesWork](https://github.com/BradReesWork) +- Delete unused/out-dated primitives ([#1704](https://github.com/rapidsai/cugraph/pull/1704)) [@seunghwak](https://github.com/seunghwak) +- Update primitives to support DCSR (DCSC) segments (Part 2/2) ([#1703](https://github.com/rapidsai/cugraph/pull/1703)) [@seunghwak](https://github.com/seunghwak) +- Fea speedup compile ([#1702](https://github.com/rapidsai/cugraph/pull/1702)) [@ChuckHastings](https://github.com/ChuckHastings) +- Update `conda` environment name for CI ([#1699](https://github.com/rapidsai/cugraph/pull/1699)) [@ajschmidt8](https://github.com/ajschmidt8) +- Count if test ([#1697](https://github.com/rapidsai/cugraph/pull/1697)) [@kaatish](https://github.com/kaatish) +- replace cudf assert_eq ([#1693](https://github.com/rapidsai/cugraph/pull/1693)) [@jnke2016](https://github.com/jnke2016) +- Fix int64 vertex_t ([#1691](https://github.com/rapidsai/cugraph/pull/1691)) [@Iroy30](https://github.com/Iroy30) +- Update primitives to support DCSR (DCSC) segments (Part 1) ([#1690](https://github.com/rapidsai/cugraph/pull/1690)) [@seunghwak](https://github.com/seunghwak) +- remove hardcoded dtype ([#1689](https://github.com/rapidsai/cugraph/pull/1689)) [@Iroy30](https://github.com/Iroy30) +- Updating Clang Version to 11.0.0 ([#1688](https://github.com/rapidsai/cugraph/pull/1688)) [@codereport](https://github.com/codereport) +- `CHECK_CUDA` macros in debug builds ([#1687](https://github.com/rapidsai/cugraph/pull/1687)) [@trxcllnt](https://github.com/trxcllnt) +- fixing symmetrize_ddf ([#1686](https://github.com/rapidsai/cugraph/pull/1686)) [@jnke2016](https://github.com/jnke2016) +- Improve Random Walks performance ([#1685](https://github.com/rapidsai/cugraph/pull/1685)) [@aschaffer](https://github.com/aschaffer) +- Use the 21.08 branch of rapids-cmake as rmm requires it ([#1683](https://github.com/rapidsai/cugraph/pull/1683)) [@robertmaynard](https://github.com/robertmaynard) +- Delete legacy renumbering implementation ([#1681](https://github.com/rapidsai/cugraph/pull/1681)) [@ChuckHastings](https://github.com/ChuckHastings) +- Fix vertex partition offsets ([#1680](https://github.com/rapidsai/cugraph/pull/1680)) [@Iroy30](https://github.com/Iroy30) +- Use std::optional (or
thrust::optional) for optional parameters & first part of DCSR (DCSC) implementation. ([#1676](https://github.com/rapidsai/cugraph/pull/1676)) [@seunghwak](https://github.com/seunghwak) +- Migrate old graph to legacy directory/namespace ([#1675](https://github.com/rapidsai/cugraph/pull/1675)) [@ChuckHastings](https://github.com/ChuckHastings) +- Expose epsilon parameter (precision) through python layer ([#1674](https://github.com/rapidsai/cugraph/pull/1674)) [@ChuckHastings](https://github.com/ChuckHastings) +- Fea hungarian expose precision ([#1673](https://github.com/rapidsai/cugraph/pull/1673)) [@ChuckHastings](https://github.com/ChuckHastings) +- Branch 21.08 merge 21.06 ([#1672](https://github.com/rapidsai/cugraph/pull/1672)) [@BradReesWork](https://github.com/BradReesWork) +- Update pins to Dask/Distributed >= 2021.6.0 ([#1666](https://github.com/rapidsai/cugraph/pull/1666)) [@pentschev](https://github.com/pentschev) +- Fix conflicts in `1643` ([#1651](https://github.com/rapidsai/cugraph/pull/1651)) [@ajschmidt8](https://github.com/ajschmidt8) +- Rename include/cugraph/patterns to include/cugraph/prims ([#1644](https://github.com/rapidsai/cugraph/pull/1644)) [@seunghwak](https://github.com/seunghwak) +- Fix merge conflicts in 1631 ([#1639](https://github.com/rapidsai/cugraph/pull/1639)) [@ajschmidt8](https://github.com/ajschmidt8) +- Update to changed `rmm::device_scalar` API ([#1637](https://github.com/rapidsai/cugraph/pull/1637)) [@harrism](https://github.com/harrism) +- Fix merge conflicts ([#1614](https://github.com/rapidsai/cugraph/pull/1614)) [@ajschmidt8](https://github.com/ajschmidt8) + +# cuGraph 21.06.00 (9 Jun 2021) + +## πŸ› Bug Fixes + +- Delete CUDA_ARCHITECTURES=OFF ([#1638](https://github.com/rapidsai/cugraph/pull/1638)) [@seunghwak](https://github.com/seunghwak) +- transform_reduce_e bug fixes ([#1633](https://github.com/rapidsai/cugraph/pull/1633)) [@ChuckHastings](https://github.com/ChuckHastings) +- Correct install path for include folder to avoid double nesting ([#1630](https://github.com/rapidsai/cugraph/pull/1630)) [@dantegd](https://github.com/dantegd) +- Remove thread local thrust::sort (thrust::sort with the execution policy thrust::seq) from copy_v_transform_reduce_key_aggregated_out_nbr ([#1627](https://github.com/rapidsai/cugraph/pull/1627)) [@seunghwak](https://github.com/seunghwak) + +## πŸš€ New Features + +- SG & MG Weakly Connected Components ([#1604](https://github.com/rapidsai/cugraph/pull/1604)) [@seunghwak](https://github.com/seunghwak) + +## πŸ› οΈ Improvements + +- Remove Pascal guard and test cuGraph use of cuco::static_map on Pascal ([#1640](https://github.com/rapidsai/cugraph/pull/1640)) [@seunghwak](https://github.com/seunghwak) +- Upgraded recipe and dev envs to NCCL 2.9.9 ([#1636](https://github.com/rapidsai/cugraph/pull/1636)) [@rlratzel](https://github.com/rlratzel) +- Use UCX-Py 0.20 ([#1634](https://github.com/rapidsai/cugraph/pull/1634)) [@jakirkham](https://github.com/jakirkham) +- Updated dependencies for CalVer ([#1629](https://github.com/rapidsai/cugraph/pull/1629)) [@rlratzel](https://github.com/rlratzel) +- MG WCC improvements ([#1628](https://github.com/rapidsai/cugraph/pull/1628)) [@seunghwak](https://github.com/seunghwak) +- Initialize force_atlas2 `old_forces` device_uvector, use new `rmm::exec_policy` ([#1625](https://github.com/rapidsai/cugraph/pull/1625)) [@trxcllnt](https://github.com/trxcllnt) +- Fix developer guide examples for device_buffer ([#1619](https://github.com/rapidsai/cugraph/pull/1619)) 
[@harrism](https://github.com/harrism) +- Pass rmm memory allocator to cuco::static_map ([#1617](https://github.com/rapidsai/cugraph/pull/1617)) [@seunghwak](https://github.com/seunghwak) +- Undo disabling MG C++ testing outputs for non-root processes ([#1615](https://github.com/rapidsai/cugraph/pull/1615)) [@seunghwak](https://github.com/seunghwak) +- WCC bindings ([#1612](https://github.com/rapidsai/cugraph/pull/1612)) [@Iroy30](https://github.com/Iroy30) +- address 'ValueError: Series contains NULL values' from from_cudf_edge… ([#1610](https://github.com/rapidsai/cugraph/pull/1610)) [@mattf](https://github.com/mattf) +- Fea rmm device buffer change ([#1609](https://github.com/rapidsai/cugraph/pull/1609)) [@ChuckHastings](https://github.com/ChuckHastings) +- Update `CHANGELOG.md` links for calver ([#1608](https://github.com/rapidsai/cugraph/pull/1608)) [@ajschmidt8](https://github.com/ajschmidt8) +- Handle int64 in force atlas wrapper and update to uvector ([#1607](https://github.com/rapidsai/cugraph/pull/1607)) [@hlinsen](https://github.com/hlinsen) +- Update docs build script ([#1606](https://github.com/rapidsai/cugraph/pull/1606)) [@ajschmidt8](https://github.com/ajschmidt8) +- WCC performance/memory footprint optimization ([#1605](https://github.com/rapidsai/cugraph/pull/1605)) [@seunghwak](https://github.com/seunghwak) +- adding test graphs - part 2 ([#1603](https://github.com/rapidsai/cugraph/pull/1603)) [@ChuckHastings](https://github.com/ChuckHastings) +- Update the Random Walk binding ([#1599](https://github.com/rapidsai/cugraph/pull/1599)) [@Iroy30](https://github.com/Iroy30) +- Add mnmg out degree ([#1592](https://github.com/rapidsai/cugraph/pull/1592)) [@Iroy30](https://github.com/Iroy30) +- Update `cugraph` with newest CMake features, including CPM for dependencies ([#1585](https://github.com/rapidsai/cugraph/pull/1585)) [@robertmaynard](https://github.com/robertmaynard) +- Implement Graph Batching functionality ([#1580](https://github.com/rapidsai/cugraph/pull/1580)) [@aschaffer](https://github.com/aschaffer) +- add multi-column support in algorithms - part 2 ([#1571](https://github.com/rapidsai/cugraph/pull/1571)) [@Iroy30](https://github.com/Iroy30) + +# cuGraph 0.19.0 (21 Apr 2021) + +## 🐛 Bug Fixes + +- Fixed copyright date and format ([#1526](https://github.com//rapidsai/cugraph/pull/1526)) [@rlratzel](https://github.com/rlratzel) +- fix mg_renumber non-deterministic errors ([#1523](https://github.com//rapidsai/cugraph/pull/1523)) [@Iroy30](https://github.com/Iroy30) +- Updated NetworkX version to 2.5.1 ([#1510](https://github.com//rapidsai/cugraph/pull/1510)) [@rlratzel](https://github.com/rlratzel) +- pascal renumbering fix ([#1505](https://github.com//rapidsai/cugraph/pull/1505)) [@Iroy30](https://github.com/Iroy30) +- Fix MNMG test failures and skip tests that are not supported on Pascal ([#1498](https://github.com//rapidsai/cugraph/pull/1498)) [@jnke2016](https://github.com/jnke2016) +- Revert "Update conda recipes pinning of repo dependencies" ([#1493](https://github.com//rapidsai/cugraph/pull/1493)) [@raydouglass](https://github.com/raydouglass) +- Update conda recipes pinning of repo dependencies ([#1485](https://github.com//rapidsai/cugraph/pull/1485)) [@mike-wendt](https://github.com/mike-wendt) +- Update to make notebook_list.py compatible with numba 0.53 ([#1455](https://github.com//rapidsai/cugraph/pull/1455)) [@rlratzel](https://github.com/rlratzel) +- Fix bugs in copy_v_transform_reduce_key_aggregated_out_nbr & groupby_gpuid_and_shuffle
([#1434](https://github.com//rapidsai/cugraph/pull/1434)) [@seunghwak](https://github.com/seunghwak) +- update default path of setup to use the new directory paths in build … ([#1425](https://github.com//rapidsai/cugraph/pull/1425)) [@ChuckHastings](https://github.com/ChuckHastings) + +## πŸ“– Documentation + +- Create C++ documentation ([#1489](https://github.com//rapidsai/cugraph/pull/1489)) [@ChuckHastings](https://github.com/ChuckHastings) +- Create cuGraph developers guide ([#1431](https://github.com//rapidsai/cugraph/pull/1431)) [@ChuckHastings](https://github.com/ChuckHastings) +- Add boost 1.0 license file. ([#1401](https://github.com//rapidsai/cugraph/pull/1401)) [@seunghwak](https://github.com/seunghwak) + +## πŸš€ New Features + +- Implement C/CUDA RandomWalks functionality ([#1439](https://github.com//rapidsai/cugraph/pull/1439)) [@aschaffer](https://github.com/aschaffer) +- Add R-mat generator ([#1411](https://github.com//rapidsai/cugraph/pull/1411)) [@seunghwak](https://github.com/seunghwak) + +## πŸ› οΈ Improvements + +- Random Walks - Python Bindings ([#1516](https://github.com//rapidsai/cugraph/pull/1516)) [@jnke2016](https://github.com/jnke2016) +- Updating RAFT tag ([#1509](https://github.com//rapidsai/cugraph/pull/1509)) [@afender](https://github.com/afender) +- Clean up nullptr cuda_stream_view arguments ([#1504](https://github.com//rapidsai/cugraph/pull/1504)) [@hlinsen](https://github.com/hlinsen) +- Reduce the size of the cugraph libraries ([#1503](https://github.com//rapidsai/cugraph/pull/1503)) [@robertmaynard](https://github.com/robertmaynard) +- Add indirection and replace algorithms with new renumbering ([#1484](https://github.com//rapidsai/cugraph/pull/1484)) [@Iroy30](https://github.com/Iroy30) +- Multiple graph generator with power law distribution on sizes ([#1483](https://github.com//rapidsai/cugraph/pull/1483)) [@afender](https://github.com/afender) +- TSP solver bug fix ([#1480](https://github.com//rapidsai/cugraph/pull/1480)) [@hlinsen](https://github.com/hlinsen) +- Added cmake function and .hpp template for generating version_config.hpp file. 
([#1476](https://github.com//rapidsai/cugraph/pull/1476)) [@rlratzel](https://github.com/rlratzel) +- Fix for bug in SCC on self-loops ([#1475](https://github.com//rapidsai/cugraph/pull/1475)) [@aschaffer](https://github.com/aschaffer) +- MS BFS python APIs + EgoNet updates ([#1469](https://github.com//rapidsai/cugraph/pull/1469)) [@afender](https://github.com/afender) +- Removed unused dependencies from libcugraph recipe, moved non-test script code from test script to gpu build script ([#1468](https://github.com//rapidsai/cugraph/pull/1468)) [@rlratzel](https://github.com/rlratzel) +- Remove literals passed to `device_uvector::set_element_async` ([#1453](https://github.com//rapidsai/cugraph/pull/1453)) [@harrism](https://github.com/harrism) +- ENH Change conda build directories to work with ccache ([#1452](https://github.com//rapidsai/cugraph/pull/1452)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- Updating docs ([#1448](https://github.com//rapidsai/cugraph/pull/1448)) [@BradReesWork](https://github.com/BradReesWork) +- Improve graph primitives performance on graphs with widely varying vertex degrees ([#1447](https://github.com//rapidsai/cugraph/pull/1447)) [@seunghwak](https://github.com/seunghwak) +- Update Changelog Link ([#1446](https://github.com//rapidsai/cugraph/pull/1446)) [@ajschmidt8](https://github.com/ajschmidt8) +- Updated NCCL to version 2.8.4 ([#1445](https://github.com//rapidsai/cugraph/pull/1445)) [@BradReesWork](https://github.com/BradReesWork) +- Update FAISS to 1.7.0 ([#1444](https://github.com//rapidsai/cugraph/pull/1444)) [@BradReesWork](https://github.com/BradReesWork) +- Update graph partitioning scheme ([#1443](https://github.com//rapidsai/cugraph/pull/1443)) [@seunghwak](https://github.com/seunghwak) +- Add additional datasets to improve coverage ([#1441](https://github.com//rapidsai/cugraph/pull/1441)) [@jnke2016](https://github.com/jnke2016) +- Update C++ MG PageRank and SG PageRank, Katz Centrality, BFS, and SSSP to use the new R-mat graph generator ([#1438](https://github.com//rapidsai/cugraph/pull/1438)) [@seunghwak](https://github.com/seunghwak) +- Remove raft handle duplication ([#1436](https://github.com//rapidsai/cugraph/pull/1436)) [@Iroy30](https://github.com/Iroy30) +- Streams infra + support in egonet ([#1435](https://github.com//rapidsai/cugraph/pull/1435)) [@afender](https://github.com/afender) +- Prepare Changelog for Automation ([#1433](https://github.com//rapidsai/cugraph/pull/1433)) [@ajschmidt8](https://github.com/ajschmidt8) +- Update 0.18 changelog entry ([#1429](https://github.com//rapidsai/cugraph/pull/1429)) [@ajschmidt8](https://github.com/ajschmidt8) +- Update and Test Renumber bindings ([#1427](https://github.com//rapidsai/cugraph/pull/1427)) [@Iroy30](https://github.com/Iroy30) +- Update Louvain to use new graph primitives and pattern accelerators ([#1423](https://github.com//rapidsai/cugraph/pull/1423)) [@ChuckHastings](https://github.com/ChuckHastings) +- Replace rmm::device_vector & thrust::host_vector with rmm::device_uvector & std::vector, respectively. 
([#1421](https://github.com//rapidsai/cugraph/pull/1421)) [@seunghwak](https://github.com/seunghwak) +- Update C++ MG PageRank test ([#1419](https://github.com//rapidsai/cugraph/pull/1419)) [@seunghwak](https://github.com/seunghwak) +- ENH Build with `cmake --build` & Pass ccache variables to conda recipe & use Ninja in CI ([#1415](https://github.com//rapidsai/cugraph/pull/1415)) [@Ethyling](https://github.com/Ethyling) +- Adding new primitives: copy_v_transform_reduce_key_aggregated_out_nbr & transform_reduce_by_adj_matrix_row|col_key_e bug fixes ([#1399](https://github.com//rapidsai/cugraph/pull/1399)) [@seunghwak](https://github.com/seunghwak) +- Add new primitives: compute_in|out_degrees, compute_in|out_weight_sums to graph_view_t ([#1394](https://github.com//rapidsai/cugraph/pull/1394)) [@seunghwak](https://github.com/seunghwak) +- Rename sort_and_shuffle to groupby_gpuid_and_shuffle ([#1392](https://github.com//rapidsai/cugraph/pull/1392)) [@seunghwak](https://github.com/seunghwak) +- Matching updates for RAFT comms updates (device_sendrecv, device_multicast_sendrecv, gather, gatherv) ([#1391](https://github.com//rapidsai/cugraph/pull/1391)) [@seunghwak](https://github.com/seunghwak) +- Fix forward-merge conflicts for #1370 ([#1377](https://github.com//rapidsai/cugraph/pull/1377)) [@ajschmidt8](https://github.com/ajschmidt8) +- Add utility function for computing a secondary cost for BFS and SSSP output ([#1376](https://github.com//rapidsai/cugraph/pull/1376)) [@hlinsen](https://github.com/hlinsen) + +# cuGraph 0.18.0 (24 Feb 2021) + +## Bug Fixes πŸ› + +- Fixed TSP returned routes (#1412) @hlinsen +- Updated CI scripts to use a different error handling convention, updated LD_LIBRARY_PATH for project flash runs (#1386) @rlratzel +- Bug fixes for MNMG coarsen_graph, renumber_edgelist, relabel (#1364) @seunghwak +- Set a specific known working commit hash for gunrock instead of "dev" (#1336) @rlratzel +- Updated git utils used by copyright.py for compatibility with current CI env (#1325) @rlratzel +- Fix MNMG Louvain tests on Pascal architecture (#1322) @ChuckHastings +- FIX Set bash trap after PATH is updated (#1321) @dillon-cullinan +- Fix graph nodes function and renumbering from series (#1319) @Iroy30 +- Fix Branch 0.18 merge 0.17 (#1314) @BradReesWork +- Fix EXPERIMENTAL_LOUVAIN_TEST on Pascal (#1312) @ChuckHastings +- Updated cuxfilter to 0.18, removed datashader indirect dependency in conda dev .yml files (#1311) @rlratzel +- Update SG PageRank C++ tests (#1307) @seunghwak + +## Documentation πŸ“– + +- Enabled MultiGraph class and tests, updated SOURCEBUILD.md to include the latest build.sh options (#1351) @rlratzel + +## New Features πŸš€ + +- New EgoNet extractor (#1365) @afender +- Implement induced subgraph extraction primitive (SG C++) (#1354) @seunghwak + +## Improvements πŸ› οΈ + +- Update stale GHA with exemptions & new labels (#1413) @mike-wendt +- Add GHA to mark issues/prs as stale/rotten (#1408) @Ethyling +- update subgraph tests and remove legacy pagerank (#1378) @Iroy30 +- Update the conda environments and README file (#1369) @BradReesWork +- Prepare Changelog for Automation (#1368) @ajschmidt8 +- Update CMakeLists.txt files for consistency with RAPIDS and to support cugraph as an external project and other tech debt removal (#1367) @rlratzel +- Use new coarsen_graph primitive in Louvain (#1362) @ChuckHastings +- Added initial infrastructure for MG C++ testing and a Pagerank MG test using it (#1361) @rlratzel +- Add SG TSP (#1360) @hlinsen +- Build a Dendrogram 
class, adapt Louvain/Leiden/ECG to use it (#1359) @ChuckHastings +- Auto-label PRs based on their content (#1358) @jolorunyomi +- Implement MNMG Renumber (#1355) @aschaffer +- Enabling pytest code coverage output by default (#1352) @jnke2016 +- Added configuration for new cugraph-doc-codeowners review group (#1344) @rlratzel +- API update to match RAFT PR #120 (#1343) @drobison00 +- Pin gunrock to v1.2 for version 0.18 (#1342) @ChuckHastings +- Fix #1340 - Use generic from_edgelist() methods (#1341) @miguelusque +- Using RAPIDS_DATASET_ROOT_DIR env var in place of absolute path to datasets in tests (#1337) @jnke2016 +- Expose dense implementation of Hungarian algorithm (#1333) @ChuckHastings +- SG Pagerank transition (#1332) @Iroy30 +- improving error checking and docs (#1327) @BradReesWork +- Fix MNMG cleanup exceptions (#1326) @Iroy30 +- Create labeler.yml (#1318) @jolorunyomi +- Updates to support nightly MG test automation (#1308) @rlratzel +- Add C++ graph functions (coarsen_graph, renumber_edgelist, relabel) and primitives (transform_reduce_by_adj_matrix_row_key, transform_reduce_by_adj_matrix_col_key, copy_v_transform_reduce_key_aggregated_out_nbr) (#1257) @seunghwak +# cuGraph 0.17.0 (10 Dec 2020) ## New Features - PR #1276 MST - PR #1245 Add functions to add pandas and numpy compatibility @@ -10,6 +233,7 @@ - PR #1279 Add self loop check variable in graph - PR #1277 SciPy sparse matrix input support for WCC, SCC, SSSP, and BFS - PR #1278 Add support for shortest_path_length and fix graph vertex checks +- PR #1280 Add Multi(Di)Graph support ## Improvements - PR #1227 Pin cmake policies to cmake 3.17 version diff --git a/README.md b/README.md index 8fee5451ac3..76015f528f6 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ The [RAPIDS](https://rapids.ai) cuGraph library is a collection of GPU accelerat **NOTE:** For the latest stable [README.md](https://github.com/rapidsai/cugraph/blob/main/README.md) ensure you are on the latest branch. - +As an example, the following Python snippet loads graph data and computes PageRank: ```python import cugraph @@ -30,18 +30,30 @@ for i in range(len(df_page)): " PageRank is " + str(df_page['pagerank'].iloc[i])) ``` +## Getting cuGraph +There are 3 ways to get cuGraph: +1. [Quick start with Docker Repo](#quick) +2. [Conda Installation](#conda) +3. [Build from Source](#source) +

+ +--- +# Currently Supported Features +As of Release 21.08 - including 21.08 nightly + ## Supported Algorithms -| Category | Algorithm | Scale | Notes +| Category | Algorithm | Scale | Notes | | ------------ | -------------------------------------- | ------------ | ------------------- | | Centrality | | | | | | Katz | Multi-GPU | | | | Betweenness Centrality | Single-GPU | | | | Edge Betweenness Centrality | Single-GPU | | | Community | | | | +| | EgoNet | Single-GPU | | | | Leiden | Single-GPU | | -| | Louvain | Multi-GPU | | +| | Louvain | Multi-GPU | [C++ README](cpp/src/community/README.md#Louvain) | | | Ensemble Clustering for Graphs | Single-GPU | | | | Spectral-Clustering - Balanced Cut | Single-GPU | | | | Spectral-Clustering - Modularity | Single-GPU | | @@ -49,32 +61,35 @@ for i in range(len(df_page)): | | Triangle Counting | Single-GPU | | | | K-Truss | Single-GPU | | | Components | | | | -| | Weakly Connected Components | Single-GPU | | +| | Weakly Connected Components | Multi-GPU | | | | Strongly Connected Components | Single-GPU | | | Core | | | | | | K-Core | Single-GPU | | | | Core Number | Single-GPU | | | Layout | | | | | | Force Atlas 2 | Single-GPU | | +| Linear Assignment| | | | +| | Hungarian | Single-GPU | [README](cpp/src/linear_assignment/README-hungarian.md) | | Link Analysis| | | | -| | Pagerank | Multi-GPU | | -| | Personal Pagerank | Multi-GPU | | -| | HITS | Single-GPU | leverages Gunrock | +| | Pagerank | Multi-GPU | [C++ README](cpp/src/centrality/README.md#Pagerank) | +| | Personal Pagerank | Multi-GPU | [C++ README](cpp/src/centrality/README.md#Personalized-Pagerank) | +| | HITS | Single-GPU | leverages Gunrock | | Link Prediction | | | | | | Jaccard Similarity | Single-GPU | | | | Weighted Jaccard Similarity | Single-GPU | | | | Overlap Similarity | Single-GPU | | +| Sampling | | | | +| | Random Walks (RW) | Single-GPU | | | Traversal | | | | -| | Breadth First Search (BFS) | Multi-GPU | | -| | Single Source Shortest Path (SSSP) | Multi-GPU | | -| Structure | | | | -| | Renumbering | Single-GPU | multiple columns, any data type | -| | Symmetrize | Multi-GPU | | -| Other | | | | -| | Hungarian Algorithm | Single-GPU | | +| | Breadth First Search (BFS) | Multi-GPU | with cutoff support
[C++ README](cpp/src/traversal/README.md#BFS) | +| | Single Source Shortest Path (SSSP) | Multi-GPU | [C++ README](cpp/src/traversal/README.md#SSSP) | +| | Traveling Salesperson Problem (TSP) | Single-GPU | | +| Tree | | | | | | Minimum Spanning Tree | Single-GPU | | | | Maximum Spanning Tree | Single-GPU | | - +| Other | | | | +| | Renumbering | Multi-GPU | multiple columns, any data type | +| | Symmetrize | Multi-GPU | | | | |
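+As a quick illustration of the call pattern shared by most of the algorithms above (a hedged sketch, assuming `G` is a `cugraph.Graph` already populated via `from_cudf_edgelist()` as in the PageRank example earlier):
+
+```python
+import cugraph
+
+# Weakly connected components: returns a cuDF DataFrame mapping each
+# 'vertex' to a component label (column names per this release's docs)
+wcc_df = cugraph.weakly_connected_components(G)
+
+# Louvain: returns a (partition DataFrame, modularity score) pair
+parts_df, modularity = cugraph.louvain(G)
+```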

@@ -83,13 +98,16 @@ for i in range(len(df_page)): | --------------- | --------------------------------------------------- | | Graph | An undirected Graph | | DiGraph | A Directed Graph | -| _Multigraph_ | _coming in 0.18_ | -| _MultiDigraph_ | _coming in 0.18_ | +| Multigraph | A Graph with multiple edges between a vertex pair | +| MultiDigraph | A Directed Graph with multiple edges between a vertex pair | | | | +All algorithms support Graph and MultiGraph (directed and undirected). + +

## Supported Data Types -cuGraph supports the creation of a graph several data types: +cuGraph supports graph creation with Source and Destination being expressed as: * cuDF DataFrame * Pandas DataFrame @@ -103,11 +121,8 @@ cuGraph tries to match the return type based on the input type. So a NetworkX i ## cuGraph Notice -The current version of cuGraph has some limitations: -- Vertex IDs are expected to be contiguous integers starting from 0. - -cuGraph provides the renumber function to mitigate this problem, which is by default automatically called when data is addted to a graph. Input vertex IDs for the renumber function can be any type, can be non-contiguous, can be multiple columns, and can start from an arbitrary number. The renumber function maps the provided input vertex IDs to 32-bit contiguous integers starting from 0. cuGraph still requires the renumbered vertex IDs to be representable in 32-bit integers. These limitations are being addressed and will be fixed soon. +Vertex IDs are expected to be contiguous integers starting from 0. If your data doesn't match that restriction, we have a solution. cuGraph provides the renumber function, which is by default automatically called when data is added to a graph. Input vertex IDs for the renumber function can be any type, can be non-contiguous, can be multiple columns, and can start from an arbitrary number. The renumber function maps the provided input vertex IDs to either 32- or 64-bit contiguous integers starting from 0. Additionally, when using the auto-renumbering feature, vertices are automatically un-renumbered in results (see the short sketch below). @@ -123,42 +138,31 @@ The amount of memory required is dependent on the graph structure and the analyt The use of managed memory for oversubscription can also be used to exceed the above memory limitations. See the recent blog on _Tackling Large Graphs with RAPIDS cuGraph and CUDA Unified Memory on GPUs_: https://medium.com/rapids-ai/tackling-large-graphs-with-rapids-cugraph-and-unified-virtual-memory-b5b69a065d4 +
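+A minimal sketch of the renumbering behavior described above (hypothetical IDs, not from the datasets shipped with cuGraph):
+
+```python
+import cudf
+import cugraph
+
+# Vertex IDs are non-contiguous and do not start at 0
+gdf = cudf.DataFrame({"src": [10, 500, 10], "dst": [500, 9999, 9999]})
+
+G = cugraph.Graph()
+# renumber=True (the default) maps the IDs to contiguous integers internally
+G.from_cudf_edgelist(gdf, source="src", destination="dst", renumber=True)
+
+# Results come back un-renumbered: 'vertex' contains 10, 500, and 9999
+print(cugraph.pagerank(G))
+```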

-## Getting cuGraph -### Intro -There are 3 ways to get cuGraph : -1. [Quick start with Docker Demo Repo](#quick) -2. [Conda Installation](#conda) -3. [Build from Source](#source) - - - - +--- ## Quick Start -Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapidsai/), choosing a tag based on the NVIDIA CUDA version you’re running. This provides a ready to run Docker container with example notebooks and data, showcasing how you can utilize all of the RAPIDS libraries: cuDF, cuML, and cuGraph. +Please see the [Docker Repository](https://hub.docker.com/r/rapidsai/rapidsai/), choosing a tag based on the NVIDIA CUDA version you’re running. This provides a ready-to-run Docker container with example notebooks and data, showcasing how you can utilize all of the RAPIDS libraries: cuDF, cuML, and cuGraph. -### Conda +## Conda It is easy to install cuGraph using conda. You can get a minimal conda installation with [Miniconda](https://conda.io/miniconda.html) or get the full installation with [Anaconda](https://www.anaconda.com/download). Install and update cuGraph using the conda command: ```bash -# CUDA 10.1 -conda install -c nvidia -c rapidsai -c numba -c conda-forge -c defaults cugraph cudatoolkit=10.1 - -# CUDA 10.2 -conda install -c nvidia -c rapidsai -c numba -c conda-forge -c defaults cugraph cudatoolkit=10.2 - # CUDA 11.0 -conda install -c nvidia -c rapidsai -c numba -c conda-forge -c defaults cugraph cudatoolkit=11.0 +conda install -c nvidia -c rapidsai -c numba -c conda-forge cugraph cudatoolkit=11.0 + +# CUDA 11.2 +conda install -c nvidia -c rapidsai -c numba -c conda-forge cugraph cudatoolkit=11.2 ``` Note: This conda installation only applies to Linux and Python versions 3.7/3.8. -### Build from Source and Contributing +## Build from Source and Contributing Please see our [guide for building cuGraph from source](SOURCEBUILD.md) diff --git a/SOURCEBUILD.md b/SOURCEBUILD.md index 8acd90c4f7f..47b842a0ce6 100644 --- a/SOURCEBUILD.md +++ b/SOURCEBUILD.md @@ -1,25 +1,21 @@ # Building from Source -The following instructions are for users wishing to build cuGraph from source code. These instructions are tested on supported distributions of Linux, CUDA, and Python - See [RAPIDS Getting Started](https://rapids.ai/start.html) for list of supported environments. Other operating systems _might be_ compatible, but are not currently tested. +The following instructions are for users wishing to build cuGraph from source code. These instructions are tested on supported distributions of Linux, CUDA, and Python - See [RAPIDS Getting Started](https://rapids.ai/start.html) for a list of supported environments. Other operating systems _might be_ compatible, but are not currently tested. The cuGraph package includes both a C/C++ CUDA portion and a Python portion. Both libraries need to be installed in order for cuGraph to operate correctly. ## Prerequisites __Compiler__: -* `gcc` version 5.4+ -* `nvcc` version 10.0+ -* `cmake` version 3.12+ +* `gcc` version 9.3+ +* `nvcc` version 11.0+ +* `cmake` version 3.20.1+ __CUDA:__ -* CUDA 10.1+ -* NVIDIA driver 396.44+ +* CUDA 11.0+ +* NVIDIA driver 450.80.02+ * Pascal architecture or better -__Other__ -* `git` - - You can obtain CUDA from [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads).
@@ -47,17 +43,12 @@ __Create the conda development environment__ ```bash # create the conda environment (assuming in base `cugraph` directory) - - -# for CUDA 10.1 -conda env create --name cugraph_dev --file conda/environments/cugraph_dev_cuda10.1.yml - -# for CUDA 10.2 -conda env create --name cugraph_dev --file conda/environments/cugraph_dev_cuda10.2.yml - -# for CUDA 11 +# for CUDA 11.0 conda env create --name cugraph_dev --file conda/environments/cugraph_dev_cuda11.0.yml +# for CUDA 11.2 +conda env create --name cugraph_dev --file conda/environments/cugraph_dev_cuda11.2.yml + # activate the environment conda activate cugraph_dev @@ -70,15 +61,12 @@ conda deactivate ```bash -# for CUDA 10.1 -conda env update --name cugraph_dev --file conda/environments/cugraph_dev_cuda10.1.yml - -# for CUDA 10.2 -conda env update --name cugraph_dev --file conda/environments/cugraph_dev_cuda10.2.yml - -# for CUDA 11 +# for CUDA 11.0 conda env update --name cugraph_dev --file conda/environments/cugraph_dev_cuda11.0.yml +# for CUDA 11.2 +conda env update --name cugraph_dev --file conda/environments/cugraph_dev_cuda11.2.yml + conda activate cugraph_dev ``` @@ -97,17 +85,21 @@ There are several other options available on the build script for advanced users `build.sh` options: ```bash build.sh [<target> ...] [<flag> ...] - clean - remove all existing build artifacts and configuration (start over) - libcugraph - build the cugraph C++ code - cugraph - build the cugraph Python package - + where <target> is: + clean - remove all existing build artifacts and configuration (start over) + libcugraph - build the cugraph C++ code + cugraph - build the cugraph Python package + docs - build the docs and <flag> is: -v - verbose build mode -g - build for debug -n - no install step + --allgpuarch - build for all supported GPU architectures --show_depr_warn - show cmake deprecation warnings -h - print this text + default action (no args) is to build and install 'libcugraph' then 'cugraph' then 'docs' targets + examples: $ ./build.sh clean # remove prior build artifacts (start over) $ ./build.sh libcugraph -v # compile and install libcugraph with verbose output @@ -189,7 +181,7 @@ Run either the C++ or the Python tests with datasets ```bash cd $CUGRAPH_HOME/datasets - source get_test_data.sh #This takes about 10 minutes and download 1GB data (>5 GB uncompressed) + source get_test_data.sh #This takes about 10 minutes and downloads 1GB data (>5 GB uncompressed) ``` Run the C++ tests on large input: @@ -228,8 +220,8 @@ Next the env_vars.sh file needs to be edited vi ./etc/conda/activate.d/env_vars.sh #!/bin/bash -export PATH=/usr/local/cuda-10.1/bin:$PATH # or cuda-10.2 if using CUDA 10.2 -export LD_LIBRARY_PATH=/usr/local/cuda-10.1/lib64:$LD_LIBRARY_PATH # or cuda-10.2 if using CUDA 10.2 +export PATH=/usr/local/cuda-11.0/bin:$PATH # or cuda-11.1 if using CUDA 11.1 and cuda-11.2 if using CUDA 11.2, respectively +export LD_LIBRARY_PATH=/usr/local/cuda-11.0/lib64:$LD_LIBRARY_PATH # or cuda-11.1 if using CUDA 11.1 and cuda-11.2 if using CUDA 11.2, respectively ``` ``` diff --git a/benchmarks/bench_algos.py b/benchmarks/bench_algos.py index 9be636ca480..5284ffbd37b 100644 --- a/benchmarks/bench_algos.py +++ b/benchmarks/bench_algos.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at @@ -32,6 +32,7 @@ def setFixtureParamNames(*args, **kwargs): import cugraph from cugraph.structure.number_map import NumberMap from cugraph.tests import utils +from cugraph.utilities.utils import is_device_version_less_than import rmm from .params import FIXTURE_PARAMS @@ -50,9 +51,9 @@ def createGraph(csvFileName, graphType=None): # complexity lower, and assume tests have coverage to verify # correctness for those combinations. if "/directed/" in csvFileName: - graphType = cugraph.structure.graph.DiGraph + graphType = cugraph.structure.graph_classes.DiGraph else: - graphType = cugraph.structure.graph.Graph + graphType = cugraph.structure.graph_classes.Graph return cugraph.from_cudf_edgelist( utils.read_csv_file(csvFileName), @@ -121,7 +122,7 @@ def graphWithAdjListComputed(request): csvFileName = request.param[0] reinitRMM(request.param[1], request.param[2]) - G = createGraph(csvFileName, cugraph.structure.graph.Graph) + G = createGraph(csvFileName, cugraph.structure.graph_classes.Graph) G.view_adj_list() return G @@ -165,7 +166,7 @@ def bench_create_graph(gpubenchmark, edgelistCreated): gpubenchmark(cugraph.from_cudf_edgelist, edgelistCreated, source="0", destination="1", - create_using=cugraph.structure.graph.Graph, + create_using=cugraph.structure.graph_classes.Graph, renumber=False) @@ -182,7 +183,7 @@ def bench_create_digraph(gpubenchmark, edgelistCreated): gpubenchmark(cugraph.from_cudf_edgelist, edgelistCreated, source="0", destination="1", - create_using=cugraph.structure.graph.DiGraph, + create_using=cugraph.structure.graph_classes.DiGraph, renumber=False) @@ -212,6 +213,8 @@ def bench_jaccard(gpubenchmark, graphWithAdjListComputed): gpubenchmark(cugraph.jaccard, graphWithAdjListComputed) +@pytest.mark.skipif( + is_device_version_less_than((7, 0)), reason="Not supported on Pascal") def bench_louvain(gpubenchmark, graphWithAdjListComputed): gpubenchmark(cugraph.louvain, graphWithAdjListComputed) diff --git a/build.sh b/build.sh index b3d3463ed4e..506cc482c59 100755 --- a/build.sh +++ b/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # cugraph build script @@ -19,18 +19,21 @@ ARGS=$* REPODIR=$(cd $(dirname $0); pwd) LIBCUGRAPH_BUILD_DIR=${LIBCUGRAPH_BUILD_DIR:=${REPODIR}/cpp/build} -VALIDARGS="clean libcugraph cugraph docs -v -g -n --allgpuarch --show_depr_warn -h --help" +VALIDARGS="clean uninstall libcugraph cugraph cpp-mgtests docs -v -g -n --allgpuarch --buildfaiss --show_depr_warn -h --help" HELP="$0 [<target> ...] [<flag> ...] where <target> is: clean - remove all existing build artifacts and configuration (start over) + uninstall - uninstall libcugraph and cugraph from a prior build/install (see also -n) libcugraph - build the cugraph C++ code cugraph - build the cugraph Python package + cpp-mgtests - build libcugraph mnmg tests. Builds MPI communicator, adding MPI as a dependency.
docs - build the docs and <flag> is: -v - verbose build mode -g - build for debug - -n - no install step + -n - do not install after a successful build --allgpuarch - build for all supported GPU architectures + --buildfaiss - build faiss statically into cugraph --show_depr_warn - show cmake deprecation warnings -h - print this text @@ -44,11 +47,13 @@ CUGRAPH_BUILD_DIR=${REPODIR}/python/build BUILD_DIRS="${LIBCUGRAPH_BUILD_DIR} ${CUGRAPH_BUILD_DIR}" # Set defaults for vars modified by flags to this script -VERBOSE="" +VERBOSE_FLAG="" BUILD_TYPE=Release INSTALL_TARGET=install BUILD_DISABLE_DEPRECATION_WARNING=ON -GPU_ARCH="" +BUILD_CPP_MG_TESTS=OFF +BUILD_STATIC_FAISS=OFF +BUILD_ALL_GPU_ARCH=0 # Set defaults for vars that may not have been defined externally # FIXME: if PREFIX is not set, check CONDA_PREFIX, but there is no fallback @@ -82,7 +87,7 @@ fi # Process flags if hasArg -v; then - VERBOSE=1 + VERBOSE_FLAG="-v" fi if hasArg -g; then BUILD_TYPE=Debug @@ -91,18 +96,48 @@ if hasArg -n; then INSTALL_TARGET="" fi if hasArg --allgpuarch; then - GPU_ARCH="-DGPU_ARCHS=ALL" + BUILD_ALL_GPU_ARCH=1 +fi +if hasArg --buildfaiss; then + BUILD_STATIC_FAISS=ON fi if hasArg --show_depr_warn; then BUILD_DISABLE_DEPRECATION_WARNING=OFF fi +if hasArg cpp-mgtests; then + BUILD_CPP_MG_TESTS=ON +fi + +# If clean or uninstall given, run them prior to any other steps +if hasArg uninstall; then + # uninstall libcugraph + if [[ "$INSTALL_PREFIX" != "" ]]; then + rm -rf ${INSTALL_PREFIX}/include/cugraph + rm -f ${INSTALL_PREFIX}/lib/libcugraph.so + fi + # This may be redundant given the above, but can also be used in case + # there are other installed files outside of the locations above. + if [ -e ${LIBCUGRAPH_BUILD_DIR}/install_manifest.txt ]; then + xargs rm -f < ${LIBCUGRAPH_BUILD_DIR}/install_manifest.txt > /dev/null 2>&1 + fi + # uninstall cugraph installed from a prior "setup.py install" + pip uninstall -y cugraph +fi -# If clean given, run it prior to any other steps if hasArg clean; then - # FIXME: ideally the "setup.py clean" command below would also be run to - # remove all the "inplace" python build artifacts, but currently, running - # any setup.py command has side effects (eg. cloning repos). - #(cd ${REPODIR}/python && python setup.py clean) + # remove artifacts generated inplace + # FIXME: ideally the "setup.py clean" command would be used for this, but + # currently running any setup.py command has side effects (eg. cloning + # repos). + # (cd ${REPODIR}/python && python setup.py clean) + if [[ -d ${REPODIR}/python ]]; then + pushd ${REPODIR}/python > /dev/null + rm -rf dist dask-worker-space cugraph/raft *.egg-info + find . -name "__pycache__" -type d -exec rm -rf {} \; > /dev/null 2>&1 + find . -name "*.cpp" -type f -delete + find . -name "*.cpython*.so" -type f -delete + popd > /dev/null + fi # If the dirs to clean are mounted dirs in a container, the contents should # be removed but the mounted dirs will remain. The find removes all @@ -119,18 +154,23 @@ fi ################################################################################ # Configure, build, and install libcugraph if buildAll || hasArg libcugraph; then - if [[ ${GPU_ARCH} == "" ]]; then + if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then + CUGRAPH_CMAKE_CUDA_ARCHITECTURES="NATIVE" echo "Building for the architecture of the GPU in the system..." else + CUGRAPH_CMAKE_CUDA_ARCHITECTURES="ALL" echo "Building for *ALL* supported GPU architectures..."
fi mkdir -p ${LIBCUGRAPH_BUILD_DIR} cd ${LIBCUGRAPH_BUILD_DIR} cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - ${GPU_ARCH} \ - -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} ${REPODIR}/cpp - make -j${PARALLEL_LEVEL} VERBOSE=${VERBOSE} ${INSTALL_TARGET} + -DCMAKE_CUDA_ARCHITECTURES=${CUGRAPH_CMAKE_CUDA_ARCHITECTURES} \ + -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DBUILD_STATIC_FAISS=${BUILD_STATIC_FAISS} \ + -DBUILD_CUGRAPH_MG_TESTS=${BUILD_CPP_MG_TESTS} \ + ${REPODIR}/cpp + cmake --build "${LIBCUGRAPH_BUILD_DIR}" -j${PARALLEL_LEVEL} --target ${INSTALL_TARGET} ${VERBOSE_FLAG} fi # Build and install the cugraph Python package @@ -152,10 +192,11 @@ if buildAll || hasArg docs; then cd ${LIBCUGRAPH_BUILD_DIR} cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} ${REPODIR}/cpp + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} ${REPODIR}/cpp \ + -DBUILD_STATIC_FAISS=${BUILD_STATIC_FAISS} fi cd ${LIBCUGRAPH_BUILD_DIR} - make -j${PARALLEL_LEVEL} VERBOSE=${VERBOSE} docs_cugraph - cd ${REPODIR}/docs + cmake --build "${LIBCUGRAPH_BUILD_DIR}" -j${PARALLEL_LEVEL} --target docs_cugraph ${VERBOSE_FLAG} + cd ${REPODIR}/docs/cugraph make html fi diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index 5f74dca4044..f8100222c12 100644 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. ########################################## # cuGraph Benchmark test script for CI # ########################################## @@ -20,18 +20,18 @@ function cleanup { rm -f testoutput.txt } -# Set cleanup trap for Jenkins -if [ ! -z "$JENKINS_HOME" ] ; then - gpuci_logger "Jenkins environment detected, setting cleanup trap" - trap cleanup EXIT -fi - # Set path, build parallel level, and CUDA version cd $WORKSPACE export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} export CUDA_REL=${CUDA_VERSION%.*} +# Set cleanup trap for Jenkins +if [ ! -z "$JENKINS_HOME" ] ; then + gpuci_logger "Jenkins environment detected, setting cleanup trap" + trap cleanup EXIT +fi + # Set home export HOME=$WORKSPACE @@ -68,13 +68,13 @@ CUGRAPH_DEPS=(cudf rmm) LIBCUGRAPH_DEPS=(cudf rmm) gpuci_logger "Install required packages" -gpuci_conda_retry install -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge -c defaults \ +gpuci_conda_retry install -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge \ "cudf=${MINOR_VERSION}" \ "rmm=${MINOR_VERSION}" \ "cudatoolkit=$CUDA_REL" \ "dask-cudf=${MINOR_VERSION}" \ "dask-cuda=${MINOR_VERSION}" \ - "ucx-py=${MINOR_VERSION}" \ + "ucx-py=0.21.*" \ "ucx-proc=*=gpu" \ "rapids-build-env=${MINOR_VERSION}" \ rapids-pytest-benchmark diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 978ac03d85b..81388fa7b20 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. ######################## # cuGraph Style Tester # ######################## @@ -18,7 +18,8 @@ ERRORCODE=0 PATH=/conda/bin:$PATH # Activate common conda env -source activate gdf +. 
/opt/conda/etc/profile.d/conda.sh +conda activate rapids # Run flake8 and get results/return code FLAKE=`flake8 --config=python/.flake8 python` @@ -52,13 +53,14 @@ COPYRIGHT=`env PYTHONPATH=ci/utils python ci/checks/copyright.py --git-modified- CR_RETVAL=$? ERRORCODE=$((ERRORCODE | ${CR_RETVAL})) -# Output results if failure otherwise show pass if [ "$CR_RETVAL" != "0" ]; then echo -e "\n\n>>>> FAILED: copyright check; begin output\n\n" echo -e "$COPYRIGHT" echo -e "\n\n>>>> FAILED: copyright check; end output\n\n" else - echo -e "\n\n>>>> PASSED: copyright check\n\n" + echo -e "\n\n>>>> PASSED: copyright check; begin debug output\n\n" + echo -e "$COPYRIGHT" + echo -e "\n\n>>>> PASSED: copyright check; end debug output\n\n" fi exit ${ERRORCODE} diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 2c6dc899be2..4f46938ee49 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. ######################################### # cuGraph CPU conda build script for CI # ######################################### @@ -24,6 +24,10 @@ fi export GPUCI_CONDA_RETRY_MAX=1 export GPUCI_CONDA_RETRY_SLEEP=30 +# Use Ninja to build +export CMAKE_GENERATOR="Ninja" +export CONDA_BLD_DIR="${WORKSPACE}/.conda-bld" + ################################################################################ # SETUP - Check environment ################################################################################ @@ -35,6 +39,11 @@ gpuci_logger "Activate conda env" . /opt/conda/etc/profile.d/conda.sh conda activate rapids +# Remove rapidsai-nightly channel if we are building main branch +if [ "$SOURCE_BRANCH" = "main" ]; then + conda config --system --remove channels rapidsai-nightly +fi + gpuci_logger "Check versions" python --version $CC --version @@ -55,18 +64,20 @@ conda config --set ssl_verify False gpuci_logger "Build conda pkg for libcugraph" if [ "$BUILD_LIBCUGRAPH" == '1' ]; then if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then - conda build conda/recipes/libcugraph + gpuci_conda_retry build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libcugraph else - conda build --dirty --no-remove-work-dir conda/recipes/libcugraph + gpuci_conda_retry build --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libcugraph + mkdir -p ${CONDA_BLD_DIR}/libcugraph/work + cp -r ${CONDA_BLD_DIR}/work/* ${CONDA_BLD_DIR}/libcugraph/work fi fi gpuci_logger "Build conda pkg for cugraph" if [ "$BUILD_CUGRAPH" == "1" ]; then if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then - conda build conda/recipes/cugraph --python=$PYTHON + gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/cugraph --python=$PYTHON else - conda build conda/recipes/cugraph -c ci/artifacts/cugraph/cpu/conda-bld/ --dirty --no-remove-work-dir --python=$PYTHON + gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/cugraph -c ci/artifacts/cugraph/cpu/.conda-bld/ --dirty --no-remove-work-dir --python=$PYTHON fi fi diff --git a/ci/cpu/prebuild.sh b/ci/cpu/prebuild.sh index ee471329b35..9f2629d153c 100644 --- a/ci/cpu/prebuild.sh +++ b/ci/cpu/prebuild.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -18,12 +18,6 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then export BUILD_LIBCUGRAPH=1 fi -if [[ "$CUDA" == "10.1" ]]; then - export UPLOAD_CUGRAPH=1 -else - export UPLOAD_CUGRAPH=0 -fi - if [[ "$PYTHON" == "3.7" ]]; then export UPLOAD_LIBCUGRAPH=1 else diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index 0fca82216c3..11f28366c5f 100644 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -1,4 +1,5 @@ #!/bin/bash +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # # Adopted from https://github.com/tmcdonell/travis-scripts/blob/dfaac280ac2082cd6bcaba3217428347899f2975/update-accelerate-buildbot.sh @@ -29,8 +30,8 @@ fi gpuci_logger "Get conda file output locations" -export LIBCUGRAPH_FILE=`conda build conda/recipes/libcugraph --output` -export CUGRAPH_FILE=`conda build conda/recipes/cugraph --python=$PYTHON --output` +export LIBCUGRAPH_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libcugraph --output` +export CUGRAPH_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cugraph --python=$PYTHON --output` ################################################################################ # UPLOAD - Conda packages @@ -42,13 +43,13 @@ if [[ "$BUILD_LIBCUGRAPH" == "1" && "$UPLOAD_LIBCUGRAPH" == "1" ]]; then test -e ${LIBCUGRAPH_FILE} echo "Upload libcugraph" echo ${LIBCUGRAPH_FILE} - gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBCUGRAPH_FILE} + gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBCUGRAPH_FILE} --no-progress fi -if [[ "$BUILD_CUGRAPH" == "1" && "$UPLOAD_CUGRAPH" == "1" ]]; then +if [[ "$BUILD_CUGRAPH" == "1" ]]; then test -e ${CUGRAPH_FILE} echo "Upload cugraph" echo ${CUGRAPH_FILE} - gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUGRAPH_FILE} + gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUGRAPH_FILE} --no-progress fi diff --git a/ci/docs/build.sh b/ci/docs/build.sh index 6ce223d8b2b..2135ff04b45 100644 --- a/ci/docs/build.sh +++ b/ci/docs/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. ################################# # cuGraph Docs build script for CI # ################################# @@ -15,7 +15,6 @@ export PATH=/conda/bin:/usr/local/cuda/bin:$PATH export HOME=$WORKSPACE export PROJECT_WORKSPACE=/rapids/cugraph export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" -export NIGHTLY_VERSION=$(echo $BRANCH_VERSION | awk -F. '{print $2}') export PROJECTS=(cugraph libcugraph) gpuci_logger "Check environment" @@ -28,11 +27,6 @@ gpuci_logger "Activate conda env" . /opt/conda/etc/profile.d/conda.sh conda activate rapids -# TODO: Move installs to docs-build-env meta package -gpuci_conda_retry install -c anaconda markdown beautifulsoup4 jq -pip install sphinx-markdown-tables - - gpuci_logger "Check versions" python --version $CC --version @@ -47,10 +41,10 @@ conda list --show-channel-urls gpuci_logger "Build Doxygen docs" cd $PROJECT_WORKSPACE/cpp/build make docs_cugraph - + # Build Python docs gpuci_logger "Build Sphinx docs" -cd $PROJECT_WORKSPACE/docs +cd $PROJECT_WORKSPACE/docs/cugraph make html #Commit to Website @@ -60,10 +54,10 @@ for PROJECT in ${PROJECTS[@]}; do if [ ! 
-d "api/$PROJECT/$BRANCH_VERSION" ]; then mkdir -p api/$PROJECT/$BRANCH_VERSION fi - rm -rf $DOCS_WORKSPACE/api/$PROJECT/$BRANCH_VERSION/* + rm -rf $DOCS_WORKSPACE/api/$PROJECT/$BRANCH_VERSION/* done mv $PROJECT_WORKSPACE/cpp/doxygen/html/* $DOCS_WORKSPACE/api/libcugraph/$BRANCH_VERSION -mv $PROJECT_WORKSPACE/docs/build/html/* $DOCS_WORKSPACE/api/cugraph/$BRANCH_VERSION +mv $PROJECT_WORKSPACE/docs/cugraph/build/html/* $DOCS_WORKSPACE/api/cugraph/$BRANCH_VERSION diff --git a/ci/getGTestTimes.sh b/ci/getGTestTimes.sh deleted file mode 100755 index 8a3752d76e2..00000000000 --- a/ci/getGTestTimes.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -# Copyright (c) 2019-2020, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This script will print the gtest results sorted by runtime. This will print -# the results two ways: first by printing all tests sorted by runtime, then by -# printing all tests grouped by test binary with tests sorted by runtime within -# the group. -# -# To use this script, capture the test run output to a file then run this script -# with the file as the first arg, or just redirect test output to this script. - -awk '/^Running GoogleTest .+$/ { - testbinary = $3 - } - /^\[ OK \].+$/ { - testtime = substr($(NF-1),2) - newtestdata = testbinary ":" substr($0,14) - alltestdata = alltestdata newtestdata "\n" - testdata[testbinary] = testdata[testbinary] newtestdata "\n" - totaltime = totaltime + testtime - } - END { - # Print all tests sorted by time - system("echo \"" alltestdata "\" | sort -r -t\\( -nk2") - print "\n================================================================================" - # Print test binaries with tests sorted by time - print "Tests grouped by test binary:" - for (testbinary in testdata) { - print testbinary - system("echo \"" testdata[testbinary] "\" | sort -r -t\\( -nk2") - } - print "\n================================================================================" - print totaltime " milliseconds = " totaltime/60000 " minutes" - } -' $1 diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 9dd6e14181e..02e139fc05e 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -1,10 +1,10 @@ #!/usr/bin/env bash -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
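For reference, the `ci/getGTestTimes.sh` helper deleted above was consumed exactly as its header comment describes; the old caller in `ci/gpu/build.sh` (removed further down in this diff) looked like this:

```bash
# Capture the gtest output, including the "Running GoogleTest <name>"
# banners the awk script keys on, then print the slowest tests:
${WORKSPACE}/ci/test.sh ${TEST_MODE_FLAG} | tee testoutput.txt
echo -e "\nTOP 20 SLOWEST TESTS:\n"
# Wrapped in echo so a non-zero exit from this non-essential step is ignored:
echo "$(${WORKSPACE}/ci/getGTestTimes.sh testoutput.txt | head -20)"
```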
########################################## # cuGraph GPU build & testscript for CI # ########################################## -set -e -set -o pipefail +set -e # abort the script on error, this will change for running tests (see below) +set -o pipefail # piped commands propagate their error NUMARGS=$# ARGS=$* @@ -16,6 +16,7 @@ function hasArg { export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} export CUDA_REL=${CUDA_VERSION%.*} +export CONDA_ARTIFACT_PATH=${WORKSPACE}/ci/artifacts/cugraph/cpu/.conda-bld/ function cleanup { gpuci_logger "Removing datasets and temp files" @@ -61,7 +62,7 @@ gpuci_conda_retry install -y \ "cudatoolkit=$CUDA_REL" \ "dask-cudf=${MINOR_VERSION}" \ "dask-cuda=${MINOR_VERSION}" \ - "ucx-py=${MINOR_VERSION}" \ + "ucx-py=0.21.*" \ "ucx-proc=*=gpu" \ "rapids-build-env=$MINOR_VERSION.*" \ "rapids-notebook-env=$MINOR_VERSION.*" \ @@ -71,10 +72,6 @@ gpuci_conda_retry install -y \ # gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env # gpuci_conda_retry install -y "your-pkg=1.0.0" -gpuci_logger "Install the master version of dask and distributed" -pip install "git+https://github.com/dask/distributed.git" --upgrade --no-deps -pip install "git+https://github.com/dask/dask.git" --upgrade --no-deps - gpuci_logger "Check versions" python --version $CC --version @@ -90,18 +87,44 @@ conda list --show-channel-urls ################################################################################ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then - gpuci_logger "Build from source" - $WORKSPACE/build.sh -v clean libcugraph cugraph --allgpuarch + gpuci_logger "Build from source" + $WORKSPACE/build.sh -v clean libcugraph cugraph +else + export LIBCUGRAPH_BUILD_DIR="$WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build" + + # Faiss patch + echo "Update libcugraph.so" + cd $LIBCUGRAPH_BUILD_DIR + chrpath -d libcugraph.so + patchelf --replace-needed `patchelf --print-needed libcugraph.so | grep faiss` libfaiss.so libcugraph.so + + CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcugraph*.tar.bz2"` + CONDA_FILE=`basename "$CONDA_FILE" .tar.bz2` #get filename without extension + CONDA_FILE=${CONDA_FILE//-/=} #convert to conda install + echo "Installing $CONDA_FILE" + conda install -c ${CONDA_ARTIFACT_PATH} "$CONDA_FILE" + + gpuci_logger "Install the master version of dask and distributed" + pip install "git+https://github.com/dask/distributed.git" --upgrade --no-deps + pip install "git+https://github.com/dask/dask.git" --upgrade --no-deps + + echo "Build cugraph..." + $WORKSPACE/build.sh cugraph fi ################################################################################ # TEST - Run GoogleTest and py.tests for libcugraph and cuGraph ################################################################################ -set +e -Eo pipefail -EXITCODE=0 +# Switch to +e to allow failing commands to continue the script, which is needed +# so all testing commands run regardless of pass/fail. This requires the desired +# exit code to be managed using the ERR trap. +set +e # allow script to continue on error +set -E # ERR traps are inherited by subcommands trap "EXITCODE=1" ERR +EXITCODE=0 + if hasArg --skip-tests; then gpuci_logger "Skipping Tests" else @@ -117,18 +140,19 @@ else TEST_MODE_FLAG="" fi + gpuci_logger "Running cuGraph test.sh..." 
${WORKSPACE}/ci/test.sh ${TEST_MODE_FLAG} | tee testoutput.txt + gpuci_logger "Ran cuGraph test.sh : return code was: $?, gpu/build.sh exit code is now: $EXITCODE" - echo -e "\nTOP 20 SLOWEST TESTS:\n" - # Wrap in echo to prevent non-zero exit since this command is non-essential - echo "$(${WORKSPACE}/ci/getGTestTimes.sh testoutput.txt | head -20)" - + gpuci_logger "Running cuGraph notebook test script..." ${WORKSPACE}/ci/gpu/test-notebooks.sh 2>&1 | tee nbtest.log + gpuci_logger "Ran cuGraph notebook test script : return code was: $?, gpu/build.sh exit code is now: $EXITCODE" python ${WORKSPACE}/ci/utils/nbtestlog2junitxml.py nbtest.log fi -if [ -n "\${CODECOV_TOKEN}" ]; then - codecov -t \$CODECOV_TOKEN +if [ -n "${CODECOV_TOKEN}" ]; then + codecov -t $CODECOV_TOKEN fi +gpuci_logger "gpu/build.sh returning value: $EXITCODE" return ${EXITCODE} diff --git a/ci/gpu/notebook_list.py b/ci/gpu/notebook_list.py new file mode 100644 index 00000000000..23a198830a8 --- /dev/null +++ b/ci/gpu/notebook_list.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import sys +import glob + +from numba import cuda + +cuda_version_string = ".".join([str(n) for n in cuda.runtime.get_version()]) +# +# Not strictly true... however what we mean is +# Pascal or earlier +# +pascal = False +device = cuda.get_current_device() +# check for the attribute using both pre and post numba 0.53 names +cc = getattr(device, 'COMPUTE_CAPABILITY', None) or \ + getattr(device, 'compute_capability') +if (cc[0] < 7): + pascal = True + +for filename in glob.iglob('**/*.ipynb', recursive=True): + skip = False + for line in open(filename, 'r'): + if re.search('# Skip notebook test', line): + skip = True + print(f'SKIPPING {filename} (marked as skip)', file=sys.stderr) + break; + elif re.search('dask', line): + print(f'SKIPPING {filename} (suspected Dask usage, not currently automatable)', file=sys.stderr) + skip = True + break; + elif pascal and re.search('# Does not run on Pascal', line): + print(f'SKIPPING {filename} (does not run on Pascal)', file=sys.stderr) + skip = True + break; + elif re.search('# Does not run on CUDA ', line) and \ + (cuda_version_string in line): + print(f'SKIPPING {filename} (does not run on CUDA {cuda_version_string})', + file=sys.stderr) + skip = True + break; + + if not skip: + print(filename) diff --git a/ci/gpu/test-notebooks.sh b/ci/gpu/test-notebooks.sh index 389d3be0bfd..650132f116d 100755 --- a/ci/gpu/test-notebooks.sh +++ b/ci/gpu/test-notebooks.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,23 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. 
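The `set +e` / `set -E` / `trap "EXITCODE=1" ERR` combination that `ci/gpu/build.sh` adopts above recurs in the other test scripts in this diff, so it is worth seeing in isolation. A minimal, self-contained demo of the pattern (not taken from the repo):

```bash
#!/bin/bash
# Run every command even when one fails, but still exit non-zero overall.
set -o pipefail          # a failure anywhere in a pipe fails the whole pipe
set -E                   # ERR traps are inherited by functions and subshells
trap "EXITCODE=1" ERR    # any failing command flips the overall exit code
EXITCODE=0
set +e                   # ...but do not abort on the first failure

false | tee /dev/null    # fails; the trap records it and execution continues
echo "still running; exit code so far: ${EXITCODE}"    # prints 1
true                     # passing commands leave EXITCODE untouched
exit ${EXITCODE}         # exits 1 because of the earlier failure
```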
-#RAPIDS_DIR=/rapids +# Any failing command will set EXITCODE to non-zero +set -e # abort the script on error, this will change for running tests (see below) +set -o pipefail # piped commands propagate their error +set -E # ERR traps are inherited by subcommands +trap "EXITCODE=1" ERR + NOTEBOOKS_DIR=${WORKSPACE}/notebooks NBTEST=${WORKSPACE}/ci/utils/nbtest.sh LIBCUDF_KERNEL_CACHE_PATH=${WORKSPACE}/.jitcache +EXITCODE=0 cd ${NOTEBOOKS_DIR} TOPLEVEL_NB_FOLDERS=$(find . -name *.ipynb |cut -d'/' -f2|sort -u) -# Add notebooks that should be skipped here -# (space-separated list of filenames without paths) - -SKIPNBS="uvm.ipynb bfs_benchmark.ipynb louvain_benchmark.ipynb pagerank_benchmark.ipynb sssp_benchmark.ipynb release.ipynb nx_cugraph_bc_benchmarking.ipynb" - ## Check env env -EXITCODE=0 +# Do not abort the script on error. This allows all tests to run regardless of +# pass/fail but relies on the ERR trap above to manage the EXITCODE for the +# script. +set +e # Always run nbtest in all TOPLEVEL_NB_FOLDERS, set EXITCODE to failure # if any run fails @@ -37,29 +41,20 @@ for folder in ${TOPLEVEL_NB_FOLDERS}; do echo "FOLDER: ${folder}" echo "========================================" cd ${NOTEBOOKS_DIR}/${folder} - for nb in $(find . -name "*.ipynb"); do + NBLIST=$(python ${WORKSPACE}/ci/gpu/notebook_list.py) + for nb in ${NBLIST}; do nbBasename=$(basename ${nb}) - # Skip all NBs that use dask (in the code or even in their name) - if ((echo ${nb}|grep -qi dask) || \ - (grep -q dask ${nb})); then - echo "--------------------------------------------------------------------------------" - echo "SKIPPING: ${nb} (suspected Dask usage, not currently automatable)" - echo "--------------------------------------------------------------------------------" - elif (echo " ${SKIPNBS} " | grep -q " ${nbBasename} "); then - echo "--------------------------------------------------------------------------------" - echo "SKIPPING: ${nb} (listed in skip list)" - echo "--------------------------------------------------------------------------------" - else - cd $(dirname ${nb}) - nvidia-smi - ${NBTEST} ${nbBasename} - EXITCODE=$((EXITCODE | $?)) - rm -rf ${LIBCUDF_KERNEL_CACHE_PATH}/* - cd ${NOTEBOOKS_DIR}/${folder} - fi + cd $(dirname ${nb}) + nvidia-smi + ${NBTEST} ${nbBasename} + echo "Ran nbtest for $nb : return code was: $?, test script exit code is now: $EXITCODE" + echo + rm -rf ${LIBCUDF_KERNEL_CACHE_PATH}/* + cd ${NOTEBOOKS_DIR}/${folder} done done nvidia-smi +echo "Notebook test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index d853c3693c6..2ff14c6c6e9 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,42 +13,25 @@ # limitations under the License. 
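Stepping back to the notebook runner above: the old inline skip list is gone, and `test-notebooks.sh` now runs whatever `notebook_list.py` prints. A stripped-down sketch of that contract (directory handling simplified relative to the real loop; `NOTEBOOKS_DIR` as defined in the script):

```bash
# notebook_list.py prints runnable notebooks one per line and sends its
# SKIPPING diagnostics to stderr, so command substitution sees only paths.
NBLIST=$(python ${WORKSPACE}/ci/gpu/notebook_list.py)
for nb in ${NBLIST}; do
    cd "$(dirname "${nb}")"                                  # run next to the notebook
    bash "${WORKSPACE}/ci/utils/nbtest.sh" "$(basename "${nb}")"
    cd "${NOTEBOOKS_DIR}"                                    # reset for the next one
done
```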
## Usage -# bash update-version.sh <type> -# where <type> is either `major`, `minor`, `patch` +# bash update-version.sh <new_version> -set -e -# Grab argument for release type -RELEASE_TYPE=$1 +# Format is YY.MM.PP - no leading 'v' or trailing 'a' +NEXT_FULL_TAG=$1 -# Get current version and calculate next versions -CURRENT_TAG=`git tag | grep -xE 'v[0-9\.]+' | sort --version-sort | tail -n 1 | tr -d 'v'` -CURRENT_MAJOR=`echo $CURRENT_TAG | awk '{split($0, a, "."); print a[1]}'` -CURRENT_MINOR=`echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}'` -CURRENT_PATCH=`echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}'` +# Get current version +CURRENT_TAG=$(git tag --merged HEAD | grep -xE '^v.*' | sort --version-sort | tail -n 1 | tr -d 'v') +CURRENT_MAJOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[1]}') +CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}') +CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}') CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} -NEXT_MAJOR=$((CURRENT_MAJOR + 1)) -NEXT_MINOR=$((CURRENT_MINOR + 1)) -NEXT_PATCH=$((CURRENT_PATCH + 1)) -NEXT_FULL_TAG="" -NEXT_SHORT_TAG="" -# Determine release type -if [ "$RELEASE_TYPE" == "major" ]; then - NEXT_FULL_TAG="${NEXT_MAJOR}.0.0" - NEXT_SHORT_TAG="${NEXT_MAJOR}.0" -elif [ "$RELEASE_TYPE" == "minor" ]; then - NEXT_FULL_TAG="${CURRENT_MAJOR}.${NEXT_MINOR}.0" - NEXT_SHORT_TAG="${CURRENT_MAJOR}.${NEXT_MINOR}" -elif [ "$RELEASE_TYPE" == "patch" ]; then - NEXT_FULL_TAG="${CURRENT_MAJOR}.${CURRENT_MINOR}.${NEXT_PATCH}" - NEXT_SHORT_TAG="${CURRENT_MAJOR}.${CURRENT_MINOR}" -else - echo "Incorrect release type; use 'major', 'minor', or 'patch' as an argument" - exit 1 -fi +#Get <major>.<minor> for next version +NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') +NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') +NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} -echo "Preparing '$RELEASE_TYPE' release [$CURRENT_TAG -> $NEXT_FULL_TAG]" +echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" # Inplace sed replace; workaround for Linux and Mac function sed_runner() { @@ -59,13 +42,13 @@ function sed_runner() { sed_runner 's/'"CUGRAPH VERSION .* LANGUAGES C CXX CUDA)"'/'"CUGRAPH VERSION ${NEXT_FULL_TAG} LANGUAGES C CXX CUDA)"'/g' cpp/CMakeLists.txt # RTD update -sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py -sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.py +sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/cugraph/source/conf.py +sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/cugraph/source/conf.py for FILE in conda/environments/*.yml; do sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" ${FILE}; sed_runner "s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE}; sed_runner "s/dask-cuda=${CURRENT_SHORT_TAG}/dask-cuda=${NEXT_SHORT_TAG}/g" ${FILE}; sed_runner "s/dask-cudf=${CURRENT_SHORT_TAG}/dask-cudf=${NEXT_SHORT_TAG}/g" ${FILE}; - sed_runner "s/ucx-py=${CURRENT_SHORT_TAG}/ucx-py=${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner "s/cuxfilter=${CURRENT_SHORT_TAG}/cuxfilter=${NEXT_SHORT_TAG}/g" ${FILE}; done diff --git a/ci/test.sh b/ci/test.sh index db9390461c0..31660cd15ec 100755 --- a/ci/test.sh +++ b/ci/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION.
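A quick worked example of the awk field-splitting that `update-version.sh` now relies on above (the tag value is illustrative):

```bash
# Split YY.MM.PP on dots and derive the short tag:
NEXT_FULL_TAG="21.10.00"
NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}')   # -> 21
NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}')   # -> 10
NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
echo "${NEXT_SHORT_TAG}"   # prints 21.10
```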
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -# note: do not use set -e in order to allow all gtest invocations to take place, -# and instead keep track of exit status and exit with an overall exit status -set -o pipefail +# Any failing command will set EXITCODE to non-zero +set -e # abort the script on error, this will change for running tests (see below) +set -o pipefail # piped commands propagate their error +set -E # ERR traps are inherited by subcommands +trap "EXITCODE=1" ERR NUMARGS=$# ARGS=$* @@ -22,7 +24,7 @@ THISDIR=$(cd $(dirname $0);pwd) CUGRAPH_ROOT=$(cd ${THISDIR}/..;pwd) GTEST_ARGS="--gtest_output=xml:${CUGRAPH_ROOT}/test-results/" DOWNLOAD_MODE="" -ERRORCODE=0 +EXITCODE=0 export RAPIDS_DATASET_ROOT_DIR=${CUGRAPH_ROOT}/datasets @@ -50,47 +52,41 @@ else echo "Download datasets..." cd ${RAPIDS_DATASET_ROOT_DIR} bash ./get_test_data.sh ${DOWNLOAD_MODE} - ERRORCODE=$((ERRORCODE | $?)) - # no need to run tests if dataset download fails - if (( ${ERRORCODE} != 0 )); then - exit ${ERRORCODE} - fi fi if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then cd ${CUGRAPH_ROOT}/cpp/build else - export LD_LIBRARY_PATH="$WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build:$LD_LIBRARY_PATH" + export LD_LIBRARY_PATH="$WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build:$CONDA_PREFIX/lib:$LD_LIBRARY_PATH" cd $WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build fi -for gt in gtests/*; do - test_name=$(basename $gt) - echo "Running GoogleTest $test_name" - ${gt} ${GTEST_FILTER} ${GTEST_ARGS} - ERRORCODE=$((ERRORCODE | $?)) -done +# Do not abort the script on error from this point on. This allows all tests to +# run regardless of pass/fail, but relies on the ERR trap above to manage the +# EXITCODE for the script. +set +e -if [[ "$PROJECT_FLASH" == "1" ]]; then - CONDA_FILE=`find $WORKSPACE/ci/artifacts/cugraph/cpu/conda-bld/ -name "libcugraph*.tar.bz2"` - CONDA_FILE=`basename "$CONDA_FILE" .tar.bz2` #get filename without extension - CONDA_FILE=${CONDA_FILE//-/=} #convert to conda install - echo "Installing $CONDA_FILE" - conda install -c $WORKSPACE/ci/artifacts/cugraph/cpu/conda-bld/ "$CONDA_FILE" - - export LIBCUGRAPH_BUILD_DIR="$WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build" - echo "Build cugraph..." - $WORKSPACE/build.sh cugraph +if (python ${CUGRAPH_ROOT}/ci/utils/is_pascal.py); then + echo "WARNING: skipping C++ tests on Pascal GPU arch." +else + echo "C++ gtests for cuGraph..." + for gt in tests/*_TEST; do + test_name=$(basename $gt) + echo "Running gtest $test_name" + ${gt} ${GTEST_FILTER} ${GTEST_ARGS} + echo "Ran gtest $test_name : return code was: $?, test script exit code is now: $EXITCODE" + done fi echo "Python pytest for cuGraph..." cd ${CUGRAPH_ROOT}/python pytest --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph.xml -v --cov-config=.coveragerc --cov=cugraph --cov-report=xml:${WORKSPACE}/python/cugraph/cugraph-coverage.xml --cov-report term --ignore=cugraph/raft --benchmark-disable -ERRORCODE=$((ERRORCODE | $?)) +echo "Ran Python pytest for cugraph : return code was: $?, test script exit code is now: $EXITCODE" echo "Python benchmarks for cuGraph (running as tests)..." 
cd ${CUGRAPH_ROOT}/benchmarks pytest -v -m "managedmem_on and poolallocator_on and tiny" --benchmark-disable -ERRORCODE=$((ERRORCODE | $?)) +echo "Ran Python benchmarks for cuGraph (running as tests) : return code was: $?, test script exit code is now: $EXITCODE" -exit ${ERRORCODE} +echo "Test script exiting with value: $EXITCODE" +exit ${EXITCODE} diff --git a/ci/utils/git_helpers.py b/ci/utils/git_helpers.py index 83ad73fe283..a0c413b75f4 100644 --- a/ci/utils/git_helpers.py +++ b/ci/utils/git_helpers.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -59,14 +59,24 @@ def uncommittedFiles(): return ret -def changedFilesBetween(b1, b2): - """Returns a list of files changed between branches b1 and b2""" +def changedFilesBetween(baseName, branchName, commitHash): + """ + Returns a list of files changed between branches baseName and latest commit + of branchName. + """ current = branch() - __git("checkout", "--quiet", b1) - __git("checkout", "--quiet", b2) - files = __gitdiff("--name-only", "--ignore-submodules", "%s...%s" % - (b1, b2)) - __git("checkout", "--quiet", current) + # checkout "base" branch + __git("checkout", "--force", baseName) + # checkout branch for comparing + __git("checkout", "--force", branchName) + # checkout latest commit from branch + __git("checkout", "-fq", commitHash) + + files = __gitdiff("--name-only", "--ignore-submodules", + f"{baseName}..{branchName}") + + # restore the original branch + __git("checkout", "--force", current) return files.splitlines() @@ -87,10 +97,10 @@ def changesInFileBetween(file, b1, b2, pathFilter=None): def modifiedFiles(pathFilter=None): """ - If inside a CI-env (ie. currentBranch=current-pr-branch and the env-var - PR_TARGET_BRANCH is defined), then lists out all files modified between - these 2 branches. Else, lists out all the uncommitted files in the current - branch. + If inside a CI-env (ie. TARGET_BRANCH and COMMIT_HASH are defined, and + current branch is "current-pr-branch"), then lists out all files modified + between these 2 branches. Else, lists out all the uncommitted files in the + current branch. Such utility function is helpful while putting checker scripts as part of cmake, as well as CI process. This way, during development, only the files @@ -98,15 +108,26 @@ def modifiedFiles(pathFilter=None): process ALL files modified by the dev, as submiited in the PR, will be checked. This happens, all the while using the same script. 
""" - if "PR_TARGET_BRANCH" in os.environ and branch() == "current-pr-branch": - allFiles = changedFilesBetween(os.environ["PR_TARGET_BRANCH"], - branch()) + targetBranch = os.environ.get("TARGET_BRANCH") + commitHash = os.environ.get("COMMIT_HASH") + currentBranch = branch() + print(f" [DEBUG] TARGET_BRANCH={targetBranch}, COMMIT_HASH={commitHash}, " + f"currentBranch={currentBranch}") + + if targetBranch and commitHash and (currentBranch == "current-pr-branch"): + print(" [DEBUG] Assuming a CI environment.") + allFiles = changedFilesBetween(targetBranch, currentBranch, commitHash) else: + print(" [DEBUG] Did not detect CI environment.") allFiles = uncommittedFiles() + files = [] for f in allFiles: if pathFilter is None or pathFilter(f): files.append(f) + + filesToCheckString = "\n\t".join(files) if files else "" + print(f" [DEBUG] Found files to check:\n\t{filesToCheckString}\n") return files diff --git a/ci/utils/is_pascal.py b/ci/utils/is_pascal.py new file mode 100644 index 00000000000..e55a3153a12 --- /dev/null +++ b/ci/utils/is_pascal.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import sys +import glob + +from numba import cuda + +# FIXME: consolidate this code with ci/gpu/notebook_list.py + +# +# Not strictly true... however what we mean is +# Pascal or earlier +# +pascal = False + +device = cuda.get_current_device() +# check for the attribute using both pre and post numba 0.53 names +cc = getattr(device, 'COMPUTE_CAPABILITY', None) or \ + getattr(device, 'compute_capability') +if (cc[0] < 7): + pascal = True + +# Return zero (success) if pascal is True +if pascal: + sys.exit(0) +else: + sys.exit(1) diff --git a/ci/utils/nbtest.sh b/ci/utils/nbtest.sh index 8c86baeaa09..ae8b52df106 100755 --- a/ci/utils/nbtest.sh +++ b/ci/utils/nbtest.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,6 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Any failing command will set EXITCODE to non-zero +set +e # do not abort the script on error +set -o pipefail # piped commands propagate their error +set -E # ERR traps are inherited by subcommands +trap "EXITCODE=1" ERR + +# Prepend the following code to all scripts generated from nbconvert. This +# allows all cell and line magic code to run and update the namespace as if +# running in jupyter, but will also tolerate failures due to running in a +# non-jupyter env. +# Note: depending on the assumptions of the notebook script, ignoring failures +# may not be acceptable (meaning the converted notebook simply cannot run +# outside of jupyter as-is), hence the warning. 
MAGIC_OVERRIDE_CODE=" def my_run_line_magic(*args, **kwargs): g=globals() @@ -58,7 +71,6 @@ for nb in $*; do NBEXITCODE=$? echo EXIT CODE: ${NBEXITCODE} echo - EXITCODE=$((EXITCODE | ${NBEXITCODE})) done exit ${EXITCODE} diff --git a/conda/environments/builddocs.yml b/conda/environments/builddocs.yml deleted file mode 100644 index 89bd44a5542..00000000000 --- a/conda/environments/builddocs.yml +++ /dev/null @@ -1,19 +0,0 @@ -name: builddocs -channels: -- rapidsai -- pytorch -- conda-forge -- numba -- defaults -dependencies: -- python=3.6* -- cugraph=0.8* -- cudatoolkit=9.2 -- cudf=0.8* -- pyarrow=0.12.1.* -- cython=0.29* -- pip: - - numpydoc - - sphinx - - sphinx-rtd-theme - - sphinxcontrib-websupport diff --git a/conda/environments/cugraph_dev_cuda10.2.yml b/conda/environments/cugraph_dev_cuda10.2.yml deleted file mode 100644 index 6526dd73f98..00000000000 --- a/conda/environments/cugraph_dev_cuda10.2.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: cugraph_dev -channels: -- rapidsai -- nvidia -- rapidsai-nightly -- conda-forge -dependencies: -- cudf=0.17.* -- libcudf=0.17.* -- rmm=0.17.* -- cuxfilter=0.17.* -- librmm=0.17.* -- dask>=2.12.0 -- distributed>=2.12.0 -- dask-cuda=0.17* -- dask-cudf=0.17* -- nccl>=2.7 -- ucx-py=0.17* -- ucx-proc=*=gpu -- scipy -- networkx -- python-louvain -- cudatoolkit=10.2 -- clang=8.0.1 -- clang-tools=8.0.1 -- cmake>=3.12 -- python>=3.6,<3.9 -- notebook>=0.5.0 -- boost -- cython>=0.29,<0.30 -- pytest -- scikit-learn>=0.23.1 -- colorcet -- holoviews -- datashader -- sphinx -- sphinx_rtd_theme -- sphinxcontrib-websupport -- sphinx-markdown-tables -- nbsphinx -- numpydoc -- ipython -- recommonmark -- pip -- libcypher-parser -- rapids-pytest-benchmark -- doxygen diff --git a/conda/environments/cugraph_dev_cuda11.0.yml b/conda/environments/cugraph_dev_cuda11.0.yml index 5016eb9405c..d19ac1bd00e 100644 --- a/conda/environments/cugraph_dev_cuda11.0.yml +++ b/conda/environments/cugraph_dev_cuda11.0.yml @@ -5,43 +5,43 @@ channels: - rapidsai-nightly - conda-forge dependencies: -- cudf=0.17.* -- libcudf=0.17.* -- rmm=0.17.* -- cuxfilter=0.17.* -- librmm=0.17.* -- dask>=2.12.0 -- distributed>=2.12.0 -- dask-cuda=0.17* -- dask-cudf=0.17* -- nccl>=2.7 -- ucx-py=0.17* +- cudatoolkit=11.0 +- cudf=21.08.* +- libcudf=21.08.* +- rmm=21.08.* +- librmm=21.08.* +- dask>=2021.6.0,<=2021.07.1 +- distributed>=2021.6.0,<=2021.07.1 +- dask-cuda=21.08.* +- dask-cudf=21.08.* +- nccl>=2.9.9 +- ucx-py=0.21.* - ucx-proc=*=gpu - scipy -- networkx -- python-louvain -- cudatoolkit=11.0 -- clang=8.0.1 -- clang-tools=8.0.1 -- cmake>=3.12 +- networkx>=2.5.1 +- clang=11.0.0 +- clang-tools=11.0.0 +- cmake>=3.20.1 - python>=3.6,<3.9 - notebook>=0.5.0 - boost - cython>=0.29,<0.30 - pytest +- libfaiss=1.7.0 +- faiss-proc=*=cuda - scikit-learn>=0.23.1 -- colorcet -- datashader -- holoviews - sphinx - sphinx_rtd_theme - sphinxcontrib-websupport - sphinx-markdown-tables +- sphinx-copybutton - nbsphinx - numpydoc - ipython - recommonmark - pip -- libcypher-parser - rapids-pytest-benchmark - doxygen +- pytest-cov +- gtest +- gmock diff --git a/conda/environments/cugraph_dev_cuda10.1.yml b/conda/environments/cugraph_dev_cuda11.2.yml similarity index 50% rename from conda/environments/cugraph_dev_cuda10.1.yml rename to conda/environments/cugraph_dev_cuda11.2.yml index 9b4274abef5..7d2f3d26ef5 100644 --- a/conda/environments/cugraph_dev_cuda10.1.yml +++ b/conda/environments/cugraph_dev_cuda11.2.yml @@ -5,43 +5,43 @@ channels: - rapidsai-nightly - conda-forge dependencies: -- cudf=0.17.* -- libcudf=0.17.* -- 
rmm=0.17.* -- cuxfilter=0.17.* -- librmm=0.17.* -- dask>=2.12.0 -- distributed>=2.12.0 -- dask-cuda=0.17* -- dask-cudf=0.17* -- nccl>=2.7 -- ucx-py=0.17* +- cudatoolkit=11.2 +- cudf=21.08.* +- libcudf=21.08.* +- rmm=21.08.* +- librmm=21.08.* +- dask>=2021.6.0,<=2021.07.1 +- distributed>=2021.6.0,<=2021.07.1 +- dask-cuda=21.08.* +- dask-cudf=21.08.* +- nccl>=2.9.9 +- ucx-py=0.21.* - ucx-proc=*=gpu - scipy -- networkx -- python-louvain -- cudatoolkit=10.1 -- clang=8.0.1 -- clang-tools=8.0.1 -- cmake>=3.12 +- networkx>=2.5.1 +- clang=11.0.0 +- clang-tools=11.0.0 +- cmake>=3.20.1 - python>=3.6,<3.9 - notebook>=0.5.0 - boost - cython>=0.29,<0.30 - pytest +- libfaiss=1.7.0 +- faiss-proc=*=cuda - scikit-learn>=0.23.1 -- colorcet -- holoviews -- datashader - sphinx - sphinx_rtd_theme - sphinxcontrib-websupport - sphinx-markdown-tables +- sphinx-copybutton - nbsphinx - numpydoc - ipython - recommonmark - pip -- libcypher-parser - rapids-pytest-benchmark - doxygen +- pytest-cov +- gtest +- gmock diff --git a/conda/recipes/cugraph/meta.yaml b/conda/recipes/cugraph/meta.yaml index 90f5bed942a..b335ec7753e 100644 --- a/conda/recipes/cugraph/meta.yaml +++ b/conda/recipes/cugraph/meta.yaml @@ -1,20 +1,21 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # Usage: -# conda build -c nvidia -c rapidsai -c conda-forge -c defaults . +# conda build -c nvidia -c rapidsai -c conda-forge . {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} +{% set cuda_version='.'.join(environ.get('CUDA', 'unknown').split('.')[:2]) %} {% set py_version=environ.get('CONDA_PY', 36) %} package: name: cugraph version: {{ version }} source: - path: ../../.. + git_url: ../../.. build: number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_version }}_py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - CC - CXX @@ -27,27 +28,23 @@ requirements: - cython>=0.29,<0.30 - libcugraph={{ version }} - cudf={{ minor_version }} - - ucx-py {{ minor_version }} + - ucx-py 0.21 - ucx-proc=*=gpu + - cudatoolkit {{ cuda_version }}.* run: - python x.x - libcugraph={{ version }} - cudf={{ minor_version }} - dask-cudf {{ minor_version }} - dask-cuda {{ minor_version }} - - dask>=2.12.0 - - distributed>=2.12.0 - - nccl>=2.7 - - ucx-py {{ minor_version }} + - dask>=2021.6.0,<=2021.07.1 + - distributed>=2021.6.0,<=2021.07.1 + - ucx-py 0.21 - ucx-proc=*=gpu - -#test: -# commands: -# - test -f $PREFIX/include/cugraph.h - + - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} about: home: http://rapids.ai/ license: Apache-2.0 license_file: ../../../LICENSE - summary: libcugraph library + summary: cuGraph library diff --git a/conda/recipes/libcugraph/meta.yaml b/conda/recipes/libcugraph/meta.yaml index 211ec920d27..570a0ec09b2 100644 --- a/conda/recipes/libcugraph/meta.yaml +++ b/conda/recipes/libcugraph/meta.yaml @@ -1,7 +1,7 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # Usage: -# conda build -c nvidia -c rapidsai -c conda-forge -c defaults . +# conda build -c nvidia -c rapidsai -c conda-forge . {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} {% set cuda_version='.'.join(environ.get('CUDA', '9.2').split('.')[:2]) %} @@ -21,28 +21,32 @@ build: - CUDAHOSTCXX - PARALLEL_LEVEL - VERSION_SUFFIX + - CCACHE_DIR + - CCACHE_NOHASHDIR + - CCACHE_COMPILERCHECK + - CMAKE_GENERATOR + - CMAKE_C_COMPILER_LAUNCHER + - CMAKE_CXX_COMPILER_LAUNCHER + - CMAKE_CUDA_COMPILER_LAUNCHER requirements: build: - - cmake>=3.12.4 - - libcudf={{ minor_version }} + - cmake>=3.20.1 - cudatoolkit {{ cuda_version }}.* + - librmm {{ minor_version }}.* - boost-cpp>=1.66 - - libcypher-parser - - nccl>=2.7 - - ucx-py {{ minor_version }} + - nccl>=2.9.9 - ucx-proc=*=gpu + - gtest + - gmock + - faiss-proc=*=cuda + - libfaiss 1.7.0 *_cuda run: - - libcudf={{ minor_version }} - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} - - nccl>=2.7 - - ucx-py {{ minor_version }} + - nccl>=2.9.9 - ucx-proc=*=gpu - -#test: -# commands: -# - test -f $PREFIX/include/cugraph.h - + - faiss-proc=*=cuda + - libfaiss 1.7.0 *_cuda about: home: http://rapids.ai/ diff --git a/conda_build.sh b/conda_build.sh index 4643e302f5c..1254b7d8d5a 100755 --- a/conda_build.sh +++ b/conda_build.sh @@ -1,13 +1,15 @@ #!/usr/bin/env bash +# Copyright (c) 2021, NVIDIA CORPORATION + set -xe CUDA_REL=${CUDA_VERSION%.*} conda install conda-build anaconda-client conda-verify -y -conda build -c nvidia -c rapidsai -c rapidsai-nightly/label/cuda${CUDA_REL} -c conda-forge -c defaults --python=${PYTHON} conda/recipes/cugraph +conda build -c nvidia -c rapidsai -c rapidsai-nightly/label/cuda${CUDA_REL} -c conda-forge --python=${PYTHON} conda/recipes/cugraph if [ "$UPLOAD_PACKAGE" == '1' ]; then - export UPLOADFILE=`conda build -c nvidia -c rapidsai -c conda-forge -c defaults --python=${PYTHON} conda/recipes/cugraph --output` + export UPLOADFILE=`conda build -c nvidia -c rapidsai -c conda-forge --python=${PYTHON} conda/recipes/cugraph --output` SOURCE_BRANCH=main test -e ${UPLOADFILE} @@ -26,7 +28,7 @@ if [ "$UPLOAD_PACKAGE" == '1' ]; then echo "Upload" echo ${UPLOADFILE} - anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --force ${UPLOADFILE} + anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --force ${UPLOADFILE} --no-progress else echo "Skipping upload" fi diff --git a/cpp/.clang-format b/cpp/.clang-format index 6f48df58b74..0c05436e922 100644 --- a/cpp/.clang-format +++ b/cpp/.clang-format @@ -6,16 +6,22 @@ Language: Cpp AccessModifierOffset: -1 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: true +AlignConsecutiveBitFields: true AlignConsecutiveDeclarations: false +AlignConsecutiveMacros: true AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: true AllowShortCaseLabelsOnASingleLine: true +AllowShortEnumsOnASingleLine: true AllowShortFunctionsOnASingleLine: All AllowShortIfStatementsOnASingleLine: true -AllowShortLoopsOnASingleLine: true +AllowShortLambdasOnASingleLine: true +AllowShortLoopsOnASingleLine: false # This is deprecated AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None @@ -40,14 +46,14 @@ BraceWrapping: SplitEmptyFunction: false SplitEmptyRecord: false SplitEmptyNamespace: false +BreakAfterJavaFieldAnnotations: false BreakBeforeBinaryOperators: None BreakBeforeBraces: WebKit BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon BreakBeforeTernaryOperators: true 
BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeColon -BreakAfterJavaFieldAnnotations: false +BreakInheritanceList: BeforeColon BreakStringLiterals: true ColumnLimit: 100 CommentPragmas: '^ IWYU pragma:' @@ -57,7 +63,7 @@ ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 2 ContinuationIndentWidth: 2 Cpp11BracedListStyle: true -DerivePointerAlignment: true +DerivePointerAlignment: false DisableFormat: false ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true @@ -139,18 +145,20 @@ SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: ControlStatements SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 2 SpacesInAngles: false +SpacesInConditionalStatement: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false -Standard: Cpp11 +Standard: c++17 StatementMacros: - Q_UNUSED - QT_REQUIRE_VERSION # Be consistent with indent-width, even for people who use tab for indentation! TabWidth: 2 UseTab: Never - diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8504f0c7b87..9fcb40b1eba 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,401 +14,265 @@ # limitations under the License. #============================================================================= -cmake_minimum_required(VERSION 3.12..3.17 FATAL_ERROR) +cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) +include(FetchContent) +FetchContent_Declare( + rapids-cmake + GIT_REPOSITORY https://github.com/rapidsai/rapids-cmake.git + GIT_TAG origin/branch-21.08 + ) +FetchContent_MakeAvailable(rapids-cmake) -project(CUGRAPH VERSION 0.17.0 LANGUAGES C CXX CUDA) +include(rapids-cmake) +include(rapids-cpm) +include(rapids-cuda) +include(rapids-export) +include(rapids-find) -################################################################################################### -# - build type ------------------------------------------------------------------------------------ +rapids_cuda_init_architectures(CUGRAPH) -# Set a default build type if none was specified -set(DEFAULT_BUILD_TYPE "Release") - -if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - message(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' since none specified.") - set(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE - STRING "Choose the type of build." 
FORCE) - # Set the possible values of build type for cmake-gui - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS - "Debug" "Release" "MinSizeRel" "RelWithDebInfo") -endif() +project(CUGRAPH VERSION 21.08.00 LANGUAGES C CXX CUDA) -################################################################################################### -# - compiler options ------------------------------------------------------------------------------ +# Remove the following archs from CMAKE_CUDA_ARCHITECTURES that +# cuhornet currently doesn't support +# +# < 60 +# >= 86 +set(supported_archs "60" "62" "70" "72" "75" "80") +foreach( arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + string(REPLACE "-real" "" arch ${arch}) + if( arch IN_LIST supported_archs ) + list(APPEND usable_arch_values ${arch}) + endif() +endforeach() +# Make sure everything but the 'newest' arch +# is marked as `-real` so we only generate PTX for +# arch > 80 +list(POP_BACK usable_arch_values latest_arch) +list(TRANSFORM usable_arch_values APPEND "-real") +list(APPEND usable_arch_values ${latest_arch}) -set(CMAKE_CXX_STANDARD 14) -set(CMAKE_C_COMPILER $ENV{CC}) -set(CMAKE_CXX_COMPILER $ENV{CXX}) -set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_ARCHITECTURES ${usable_arch_values}) -set(CMAKE_CUDA_STANDARD 14) -set(CMAKE_CUDA_STANDARD_REQUIRED ON) -if(CMAKE_COMPILER_IS_GNUCXX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wno-error=deprecated-declarations") -endif(CMAKE_COMPILER_IS_GNUCXX) +# Write the version header +include(cmake/Modules/Version.cmake) +write_version() -find_package(CUDA) +################################################################################################### +# - build type ------------------------------------------------------------------------------------ -# Configure GPU arch to build -set(GUNROCK_GENCODE_SM60 "OFF") -set(GUNROCK_GENCODE_SM61 "OFF") -set(GUNROCK_GENCODE_SM70 "OFF") -set(GUNROCK_GENCODE_SM72 "OFF") -set(GUNROCK_GENCODE_SM75 "OFF") -set(GUNROCK_GENCODE_SM80 "OFF") +# Set a default build type if none was specified +rapids_cmake_build_type(Release) -# ARCHS handling: -# -if("${GPU_ARCHS}" STREQUAL "") - include(cmake/EvalGpuArchs.cmake) - evaluate_gpu_archs(GPU_ARCHS) -endif() +############################################################################## +# - User Options ------------------------------------------------------------ +option(BUILD_CUGRAPH_MG_TESTS "Build cuGraph multigpu algorithm tests" OFF) +set(BLAS_LIBRARIES "" CACHE STRING "Location of BLAS library for FAISS build.") +option(BUILD_STATIC_FAISS "Build the FAISS library for nearest neighbors search on GPU" OFF) +option(CMAKE_CUDA_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler" OFF) +option(BUILD_TESTS "Configure CMake to build tests" ON) -# CUDA 11 onwards cub ships with CTK -if((CUDA_VERSION_MAJOR EQUAL 11) OR (CUDA_VERSION_MAJOR GREATER 11)) - set(CUB_IS_PART_OF_CTK ON) -else() - set(CUB_IS_PART_OF_CTK OFF) -endif() +################################################################################################### +# - compiler options ------------------------------------------------------------------------------ -if("${GPU_ARCHS}" STREQUAL "ALL") - set(GPU_ARCHS "60") - if((CUDA_VERSION_MAJOR EQUAL 9) OR (CUDA_VERSION_MAJOR GREATER 9)) - set(GPU_ARCHS "${GPU_ARCHS};70") - endif() - if((CUDA_VERSION_MAJOR EQUAL 10) OR (CUDA_VERSION_MAJOR GREATER 10)) - set(GPU_ARCHS "${GPU_ARCHS};75") - endif() - if((CUDA_VERSION_MAJOR EQUAL 11) OR (CUDA_VERSION_MAJOR GREATER 11)) - set(GPU_ARCHS 
"${GPU_ARCHS};80") - endif() -endif() +rapids_find_package(CUDAToolkit REQUIRED + BUILD_EXPORT_SET cugraph-exports + INSTALL_EXPORT_SET cugraph-exports + ) -message("-- Building for GPU_ARCHS = ${GPU_ARCHS}") -foreach(arch ${GPU_ARCHS}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${arch},code=sm_${arch}") - set(GUNROCK_GENCODE_SM${arch} "ON") -endforeach() +set(CUGRAPH_CXX_FLAGS "") +set(CUGRAPH_CUDA_FLAGS "") + +if(CMAKE_COMPILER_IS_GNUCXX) + list(APPEND CUGRAPH_CXX_FLAGS -Werror -Wno-error=deprecated-declarations) +endif(CMAKE_COMPILER_IS_GNUCXX) -list(GET GPU_ARCHS -1 ptx) -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${ptx},code=compute_${ptx}") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-constexpr") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Werror=cross-execution-space-call -Wno-deprecated-declarations -Xptxas --disable-warnings") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wno-error=sign-compare,-Wno-error=unused-but-set-variable") +message("-- Building for GPU_ARCHS = ${CMAKE_CUDA_ARCHITECTURES}") +list(APPEND CUGRAPH_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) +list(APPEND CUGRAPH_CUDA_FLAGS -Werror=cross-execution-space-call -Wno-deprecated-declarations -Xptxas=--disable-warnings) +list(APPEND CUGRAPH_CUDA_FLAGS -Xcompiler=-Wall,-Wno-error=sign-compare,-Wno-error=unused-but-set-variable) +list(APPEND CUGRAPH_CUDA_FLAGS -Xfatbin=-compress-all) # Option to enable line info in CUDA device compilation to allow introspection when profiling / # memchecking -option(CMAKE_CUDA_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler" OFF) if (CMAKE_CUDA_LINEINFO) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") + list(APPEND CUGRAPH_CUDA_FLAGS -lineinfo) endif(CMAKE_CUDA_LINEINFO) # Debug options if(CMAKE_BUILD_TYPE MATCHES Debug) message(STATUS "Building with debugging flags") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G -Xcompiler -rdynamic") + list(APPEND CUGRAPH_CUDA_FLAGS -G -Xcompiler=-rdynamic) endif(CMAKE_BUILD_TYPE MATCHES Debug) -# To apply RUNPATH to transitive dependencies (this is a temporary solution) -set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--disable-new-dtags") -set(CMAKE_EXE_LINKER_FLAGS "-Wl,--disable-new-dtags") - -option(BUILD_TESTS "Configure CMake to build tests" - ON) - -################################################################################################### -# - cmake modules --------------------------------------------------------------------------------- - -set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/" ${CMAKE_MODULE_PATH}) - -include(FeatureSummary) -include(CheckIncludeFiles) -include(CheckLibraryExists) -if(BUILD_TESTS) - include(CTest) -endif(BUILD_TESTS) - -################################################################################################### -# - find boost ------------------------------------------------------------------------------------ - -find_package(Boost REQUIRED) -if(Boost_FOUND) - message(STATUS "Boost found in ${Boost_INCLUDE_DIRS}") -else() - message(FATAL_ERROR "Boost not found, please check your settings.") -endif(Boost_FOUND) - ################################################################################################### # - find openmp ----------------------------------------------------------------------------------- find_package(OpenMP) if(OpenMP_FOUND) # find_package(OPenMP) does not automatically add OpenMP flags to CUDA - set(CMAKE_CUDA_FLAGS 
"${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS}") + list(APPEND CUGRAPH_CUDA_FLAGS -Xcompiler=${OpenMP_CXX_FLAGS}) endif(OpenMP_FOUND) ################################################################################################### -# - find gtest ------------------------------------------------------------------------------------ +# - find blas ------------------------------------------------------------------------------------- -if(BUILD_TESTS) - include(ConfigureGoogleTest) - - if(GTEST_FOUND) - message(STATUS - "Google C++ Testing Framework (Google Test) found in ${GTEST_ROOT}") - else() - message(AUTHOR_WARNING - "Google C++ Testing Framework (Google Test) not found: automated tests are disabled.") - endif(GTEST_FOUND) -endif(BUILD_TESTS) - -################################################################################################### -# - RMM ------------------------------------------------------------------------------------------- - -find_path(RMM_INCLUDE "rmm" - HINTS - "$ENV{RMM_ROOT}/include" - "$ENV{CONDA_PREFIX}/include/rmm" - "$ENV{CONDA_PREFIX}/include") - -message(STATUS "RMM: RMM_INCLUDE set to ${RMM_INCLUDE}") - -################################################################################################### -# - Fetch Content --------------------------------------------------------------------------------- -include(FetchContent) - -# - THRUST/CUB -message("Fetching Thrust") - -FetchContent_Declare( - thrust - GIT_REPOSITORY https://github.com/thrust/thrust.git - # August 28, 2020 - GIT_TAG 52a8bda46c5c2128414d1d47f546b486ff0be2f0 -) - -FetchContent_GetProperties(thrust) -if(NOT thrust_POPULATED) - FetchContent_Populate(thrust) - # We are not using the thrust CMake targets, so no need to call `add_subdirectory()`. -endif() -set(THRUST_INCLUDE_DIR "${thrust_SOURCE_DIR}") - -# - cuco -message("Fetching cuco") - -FetchContent_Declare( - cuco - GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git - GIT_TAG d965ed8dea8f56da8e260a6130dddf3ca351c45f -) - -FetchContent_GetProperties(cuco) -if(NOT cuco_POPULATED) - FetchContent_Populate(cuco) -endif() -set(CUCO_INCLUDE_DIR "${cuco_SOURCE_DIR}/include") - -# - libcudacxx -# NOTE: This is necessary because libcudacxx is not supported in -# debian cuda 10.2 packages. Once 10.2 is deprecated -# we should not need this any longer. 
-message("Fetching libcudacxx") - -FetchContent_Declare( - libcudacxx - GIT_REPOSITORY https://github.com/NVIDIA/libcudacxx.git - GIT_TAG 1.3.0 - GIT_SHALLOW true -) - -FetchContent_GetProperties(libcudacxx) -if(NOT libcudacxx_POPULATED) - message("populating libcudacxx") - FetchContent_Populate(libcudacxx) +if(NOT DEFINED BLAS_LIBRARIES) + find_package( BLAS REQUIRED ) +else() + message(STATUS "Manually setting BLAS to ${BLAS_LIBRARIES}") endif() -set(LIBCUDACXX_INCLUDE_DIR "${libcudacxx_SOURCE_DIR}/include") -message("set LIBCUDACXX_INCLUDE_DIR to: ${LIBCUDACXX_INCLUDE_DIR}") - ################################################################################################### -# - External Projects ----------------------------------------------------------------------------- - -# https://cmake.org/cmake/help/v3.0/module/ExternalProject.html -include(ExternalProject) - -# - CUHORNET -set(CUHORNET_DIR ${CMAKE_CURRENT_BINARY_DIR}/cuhornet CACHE STRING "Path to cuhornet repo") -set(CUHORNET_INCLUDE_DIR ${CUHORNET_DIR}/src/cuhornet CACHE STRING "Path to cuhornet includes") - - -ExternalProject_Add(cuhornet - GIT_REPOSITORY https://github.com/rapidsai/cuhornet.git - GIT_TAG 9cb8e8803852bd895a9c95c0fe778ad6eeefa7ad - PREFIX ${CUHORNET_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" -) +# - find CPM based dependencies ------------------------------------------------------------------ -# - GUNROCK -set(GUNROCK_DIR ${CMAKE_CURRENT_BINARY_DIR}/gunrock CACHE STRING "Path to gunrock repo") -set(GUNROCK_INCLUDE_DIR ${GUNROCK_DIR}/src/gunrock_ext CACHE STRING "Path to gunrock includes") - -ExternalProject_Add(gunrock_ext - GIT_REPOSITORY https://github.com/gunrock/gunrock.git - GIT_TAG dev - PREFIX ${GUNROCK_DIR} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= - -DGUNROCK_BUILD_SHARED_LIBS=OFF - -DGUNROCK_BUILD_TESTS=OFF - -DCUDA_AUTODETECT_GENCODE=OFF - -DGUNROCK_GENCODE_SM60=${GUNROCK_GENCODE_SM60} - -DGUNROCK_GENCODE_SM61=${GUNROCK_GENCODE_SM61} - -DGUNROCK_GENCODE_SM70=${GUNROCK_GENCODE_SM70} - -DGUNROCK_GENCODE_SM72=${GUNROCK_GENCODE_SM72} - -DGUNROCK_GENCODE_SM75=${GUNROCK_GENCODE_SM75} - -DGUNROCK_GENCODE_SM80=${GUNROCK_GENCODE_SM80} - ${GUNROCK_GENCODE} - BUILD_BYPRODUCTS ${GUNROCK_DIR}/src/gunrock_ext-build/lib/libgunrock.a - INSTALL_COMMAND "" -) - -add_library(gunrock STATIC IMPORTED) - -add_dependencies(gunrock gunrock_ext) - -set_property(TARGET gunrock PROPERTY IMPORTED_LOCATION ${GUNROCK_DIR}/src/gunrock_ext-build/lib/libgunrock.a) - -# - NCCL -if(NOT NCCL_PATH) - find_package(NCCL REQUIRED) -else() - message("-- Manually set NCCL PATH to ${NCCL_PATH}") - set(NCCL_INCLUDE_DIRS ${NCCL_PATH}/include) - set(NCCL_LIBRARIES ${NCCL_PATH}/lib/libnccl.so) -endif(NOT NCCL_PATH) -# - raft - (header only) ----------------------------------------------------- +rapids_cpm_init() -# Only cloned if RAFT_PATH env variable is not defined -if(DEFINED ENV{RAFT_PATH}) - message(STATUS "RAFT_PATH environment variable detected.") - message(STATUS "RAFT_DIR set to $ENV{RAFT_PATH}") - set(RAFT_DIR "$ENV{RAFT_PATH}") +include(cmake/thirdparty/get_thrust.cmake) +include(cmake/thirdparty/get_faiss.cmake) +include(cmake/thirdparty/get_nccl.cmake) +include(cmake/thirdparty/get_rmm.cmake) - ExternalProject_Add(raft - DOWNLOAD_COMMAND "" - SOURCE_DIR ${RAFT_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "") +include(cmake/thirdparty/get_raft.cmake) -else(DEFINED ENV{RAFT_PATH}) - message(STATUS "RAFT_PATH environment variable NOT detected, cloning RAFT") - set(RAFT_DIR 
${CMAKE_CURRENT_BINARY_DIR}/raft CACHE STRING "Path to RAFT repo") +include(cmake/thirdparty/get_cuco.cmake) +include(cmake/thirdparty/get_cuhornet.cmake) - ExternalProject_Add(raft - GIT_REPOSITORY https://github.com/rapidsai/raft.git - GIT_TAG f75d7b437bf1da3df749108161b8a0505fb6b7b3 - PREFIX ${RAFT_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "") +include(cmake/thirdparty/get_gunrock.cmake) - # Redefining RAFT_DIR so it coincides with the one inferred by env variable. - set(RAFT_DIR "${RAFT_DIR}/src/raft/") -endif(DEFINED ENV{RAFT_PATH}) +if(BUILD_TESTS) + include(cmake/thirdparty/get_gtest.cmake) +endif() ################################################################################################### # - library targets ------------------------------------------------------------------------------- -# target_link_directories is added in cmake 3.13, and cmake advises to use this instead of -# link_directoires (we should switch to target_link_directories once 3.13 becomes the minimum -# required version). -link_directories( - # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the - # link directories for nvcc. - "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}") - add_library(cugraph SHARED + src/detail/utility_wrappers.cu + src/detail/shuffle_wrappers.cu src/utilities/spmv_1D.cu src/utilities/cython.cu + src/utilities/path_retrieval.cu + src/utilities/graph_bcast.cu src/structure/graph.cu src/linear_assignment/hungarian.cu - src/link_analysis/pagerank.cu - src/link_analysis/pagerank_1D.cu src/link_analysis/gunrock_hits.cpp src/traversal/bfs.cu src/traversal/sssp.cu + src/traversal/tsp.cu src/link_prediction/jaccard.cu src/link_prediction/overlap.cu src/layout/force_atlas2.cu - src/converters/renumber.cu src/converters/COOtoCSR.cu src/community/spectral_clustering.cu src/community/louvain.cu src/community/leiden.cu src/community/ktruss.cu - src/community/ECG.cu + src/community/ecg.cu src/community/triangles_counting.cu src/community/extract_subgraph_by_vertex.cu + src/community/egonet.cu + src/sampling/random_walks.cu src/cores/core_number.cu src/traversal/two_hop_neighbors.cu src/components/connectivity.cu src/centrality/katz_centrality.cu src/centrality/betweenness_centrality.cu + src/generators/generate_rmat_edgelist.cu + src/generators/generator_tools.cu + src/generators/simple_generators.cu + src/generators/erdos_renyi_generator.cu src/experimental/graph.cu src/experimental/graph_view.cu + src/experimental/coarsen_graph.cu + src/experimental/renumber_edgelist.cu + src/experimental/renumber_utils.cu + src/experimental/relabel.cu + src/experimental/induced_subgraph.cu src/experimental/bfs.cu src/experimental/sssp.cu src/experimental/pagerank.cu src/experimental/katz_centrality.cu + src/serialization/serializer.cu src/tree/mst.cu + src/components/weakly_connected_components.cu + src/structure/create_graph_from_edgelist.cpp + src/utilities/host_barrier.cpp + src/visitors/graph_envelope.cpp + src/visitors/visitors_factory.cpp + src/visitors/bfs_visitor.cpp ) -# -# NOTE: This dependency will force the building of cugraph to -# wait until after cugunrock is constructed. 
-# -add_dependencies(cugraph gunrock_ext) -add_dependencies(cugraph raft) +set_target_properties(cugraph + PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + INTERFACE_POSITION_INDEPENDENT_CODE ON +) + +target_compile_options(cugraph + PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUGRAPH_CXX_FLAGS}>" + "$<$<COMPILE_LANGUAGE:CUDA>:${CUGRAPH_CUDA_FLAGS}>" +) + +# Per-thread default stream option see https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html +# The per-thread default stream does not synchronize with other streams +target_compile_definitions(cugraph PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM) + +file(WRITE "${CUGRAPH_BINARY_DIR}/fatbin.ld" +[=[ +SECTIONS +{ + .nvFatBinSegment : { *(.nvFatBinSegment) } + .nv_fatbin : { *(.nv_fatbin) } +} +]=]) +target_link_options(cugraph PRIVATE "${CUGRAPH_BINARY_DIR}/fatbin.ld") + +add_library(cugraph::cugraph ALIAS cugraph) ################################################################################################### # - include paths --------------------------------------------------------------------------------- target_include_directories(cugraph PRIVATE - "${THRUST_INCLUDE_DIR}" - "${CUCO_INCLUDE_DIR}" - "${LIBCUDACXX_INCLUDE_DIR}" - "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" - "${Boost_INCLUDE_DIRS}" - "${RMM_INCLUDE}" - "${CMAKE_CURRENT_SOURCE_DIR}/../thirdparty" - "${CUHORNET_INCLUDE_DIR}/hornet/include" - "${CUHORNET_INCLUDE_DIR}/hornetsnest/include" - "${CUHORNET_INCLUDE_DIR}/xlib/include" - "${CUHORNET_INCLUDE_DIR}/primitives" - "${CMAKE_CURRENT_SOURCE_DIR}/src" - "${GUNROCK_INCLUDE_DIR}" - "${NCCL_INCLUDE_DIRS}" - "${RAFT_DIR}/cpp/include" + "${CMAKE_CURRENT_SOURCE_DIR}/../thirdparty" + "${CMAKE_CURRENT_SOURCE_DIR}/src" + "${NCCL_INCLUDE_DIRS}" PUBLIC - "${CMAKE_CURRENT_SOURCE_DIR}/include" + "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" + "$<INSTALL_INTERFACE:include>" ) ################################################################################################### # - link libraries -------------------------------------------------------------------------------- - -target_link_libraries(cugraph PRIVATE - gunrock cublas cusparse curand cusolver cudart cuda ${NCCL_LIBRARIES}) +target_link_libraries(cugraph + PUBLIC + rmm::rmm + cugraph::Thrust + raft::raft + PRIVATE + cuco::cuco + CUDA::cublas + CUDA::curand + CUDA::cusolver + CUDA::cusparse + cugraph::cuHornet + FAISS::FAISS + gunrock + NCCL::NCCL +) if(OpenMP_CXX_FOUND) target_link_libraries(cugraph PRIVATE @@ -461,39 +325,76 @@ target_link_libraries(cugraph PRIVATE ${OpenMP_CXX_LIB_NAMES}) endif(OpenMP_CXX_FOUND) + ################################################################################################### # - generate tests -------------------------------------------------------------------------------- if(BUILD_TESTS) - if(GTEST_FOUND) - # target_link_directories is added in cmake 3.13, and cmake advises to use this instead of - # link_directoires (we should switch to target_link_directories once 3.13 becomes the - # minimum required version).
- link_directories(${GTEST_LIBRARY_DIR}) - add_subdirectory(${CMAKE_SOURCE_DIR}/tests) - endif(GTEST_FOUND) + include(CTest) + add_subdirectory(tests) endif(BUILD_TESTS) ################################################################################################### # - install targets ------------------------------------------------------------------------------- -install(TARGETS cugraph LIBRARY - DESTINATION lib) +include(CPack) + +install(TARGETS cugraph + DESTINATION lib + EXPORT cugraph-exports) + +install(DIRECTORY include/cugraph/ + DESTINATION include/cugraph) + +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/include/cugraph/version_config.hpp + DESTINATION include/cugraph) -install(DIRECTORY include/ - DESTINATION include/cugraph) +################################################################################################ +# - install export ------------------------------------------------------------------------------- +set(doc_string +[=[ +Provide targets for cuGraph. + +cuGraph library is a collection of GPU accelerated graph algorithms that process data found in +[GPU DataFrames](https://github.com/rapidsai/cudf). + +]=]) + +set(code_string +[=[ +thrust_create_target(cugraph::Thrust FROM_OPTIONS) +]=]) + + rapids_export(INSTALL cugraph + EXPORT_SET cugraph-exports + GLOBAL_TARGETS cugraph + NAMESPACE cugraph:: + DOCUMENTATION doc_string + FINAL_CODE_BLOCK code_string + ) + +################################################################################################ +# - build export ------------------------------------------------------------------------------- +rapids_export(BUILD cugraph + EXPORT_SET cugraph-exports + GLOBAL_TARGETS cugraph + NAMESPACE cugraph:: + DOCUMENTATION doc_string + FINAL_CODE_BLOCK code_string + ) -install(DIRECTORY ${RAFT_DIR}/cpp/include/raft/ - DESTINATION include/cugraph/raft) ################################################################################################### # - make documentation ---------------------------------------------------------------------------- # requires doxygen and graphviz to be installed # from build directory, run make docs_cugraph -# doc targets for cuGraph -add_custom_command(OUTPUT CUGRAPH_DOXYGEN - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/doxygen - COMMAND doxygen Doxyfile - VERBATIM) +# doc targets for cugraph +find_package(Doxygen 1.8.11) +if(Doxygen_FOUND) + add_custom_command(OUTPUT CUGRAPH_DOXYGEN + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/doxygen + COMMAND doxygen Doxyfile + VERBATIM) -add_custom_target(docs_cugraph DEPENDS CUGRAPH_DOXYGEN) + add_custom_target(docs_cugraph DEPENDS CUGRAPH_DOXYGEN) +endif() diff --git a/cpp/cmake/EvalGpuArchs.cmake b/cpp/cmake/EvalGpuArchs.cmake deleted file mode 100644 index f3918542db9..00000000000 --- a/cpp/cmake/EvalGpuArchs.cmake +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -function(evaluate_gpu_archs gpu_archs) - set(eval_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.cu) - set(eval_exe ${PROJECT_BINARY_DIR}/eval_gpu_archs) - file(WRITE ${eval_file} - " -#include -#include -#include -using namespace std; -int main(int argc, char** argv) { - set archs; - int nDevices; - if((cudaGetDeviceCount(&nDevices) == cudaSuccess) && (nDevices > 0)) { - for(int dev=0;dev::const_iterator itr=archs.begin();itr!=archs.end();++itr) { - printf(first? \"%s\" : \";%s\", itr->c_str()); - first = false; - } - } - printf(\"\\n\"); - return 0; -} -") - execute_process( - COMMAND ${CUDA_NVCC_EXECUTABLE} - -o ${eval_exe} - --run - ${eval_file} - OUTPUT_VARIABLE __gpu_archs - OUTPUT_STRIP_TRAILING_WHITESPACE) - set(__gpu_archs_filtered "${__gpu_archs}") - foreach(arch ${__gpu_archs}) - if (arch VERSION_LESS 60) - list(REMOVE_ITEM __gpu_archs_filtered ${arch}) - endif() - endforeach() - if (NOT __gpu_archs_filtered) - message(FATAL_ERROR "No supported GPU arch found on this system") - endif() - message("Auto detection of gpu-archs: ${__gpu_archs_filtered}") - set(${gpu_archs} ${__gpu_archs_filtered} PARENT_SCOPE) -endfunction(evaluate_gpu_archs) diff --git a/cpp/cmake/Modules/ConfigureArrow.cmake b/cpp/cmake/Modules/ConfigureArrow.cmake deleted file mode 100644 index b27e53dd415..00000000000 --- a/cpp/cmake/Modules/ConfigureArrow.cmake +++ /dev/null @@ -1,98 +0,0 @@ -set(ARROW_ROOT ${CMAKE_BINARY_DIR}/arrow) - -set(ARROW_CMAKE_ARGS " -DARROW_WITH_LZ4=OFF" - " -DARROW_WITH_ZSTD=OFF" - " -DARROW_WITH_BROTLI=OFF" - " -DARROW_WITH_SNAPPY=OFF" - " -DARROW_WITH_ZLIB=OFF" - " -DARROW_BUILD_STATIC=ON" - " -DARROW_BUILD_SHARED=OFF" - " -DARROW_BOOST_USE_SHARED=ON" - " -DARROW_BUILD_TESTS=OFF" - " -DARROW_TEST_LINKAGE=OFF" - " -DARROW_TEST_MEMCHECK=OFF" - " -DARROW_BUILD_BENCHMARKS=OFF" - " -DARROW_IPC=ON" - " -DARROW_COMPUTE=OFF" - " -DARROW_CUDA=OFF" - " -DARROW_JEMALLOC=OFF" - " -DARROW_BOOST_VENDORED=OFF" - " -DARROW_PYTHON=OFF" - " -DARROW_USE_GLOG=OFF" - " -DCMAKE_VERBOSE_MAKEFILE=ON") - -configure_file("${CMAKE_SOURCE_DIR}/cmake/Templates/Arrow.CMakeLists.txt.cmake" - "${ARROW_ROOT}/CMakeLists.txt") - -file(MAKE_DIRECTORY "${ARROW_ROOT}/build") -file(MAKE_DIRECTORY "${ARROW_ROOT}/install") - -execute_process( - COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . - RESULT_VARIABLE ARROW_CONFIG - WORKING_DIRECTORY ${ARROW_ROOT}) - -if(ARROW_CONFIG) - message(FATAL_ERROR "Configuring Arrow failed: " ${ARROW_CONFIG}) -endif(ARROW_CONFIG) - -set(PARALLEL_BUILD -j) -if($ENV{PARALLEL_LEVEL}) - set(NUM_JOBS $ENV{PARALLEL_LEVEL}) - set(PARALLEL_BUILD "${PARALLEL_BUILD}${NUM_JOBS}") -endif($ENV{PARALLEL_LEVEL}) - -if(${NUM_JOBS}) - if(${NUM_JOBS} EQUAL 1) - message(STATUS "ARROW BUILD: Enabling Sequential CMake build") - elseif(${NUM_JOBS} GREATER 1) - message(STATUS "ARROW BUILD: Enabling Parallel CMake build with ${NUM_JOBS} jobs") - endif(${NUM_JOBS} EQUAL 1) -else() - message(STATUS "ARROW BUILD: Enabling Parallel CMake build with all threads") -endif(${NUM_JOBS}) - -execute_process( - COMMAND ${CMAKE_COMMAND} --build .. 
-- ${PARALLEL_BUILD} - RESULT_VARIABLE ARROW_BUILD - WORKING_DIRECTORY ${ARROW_ROOT}/build) - -if(ARROW_BUILD) - message(FATAL_ERROR "Building Arrow failed: " ${ARROW_BUILD}) -endif(ARROW_BUILD) - -set(ARROW_GENERATED_IPC_DIR - "${ARROW_ROOT}/build/src/arrow/ipc") - -configure_file(${ARROW_GENERATED_IPC_DIR}/File_generated.h ${CMAKE_SOURCE_DIR}/include/cudf/ipc_generated/File_generated.h COPYONLY) -configure_file(${ARROW_GENERATED_IPC_DIR}/Message_generated.h ${CMAKE_SOURCE_DIR}/include/cudf/ipc_generated/Message_generated.h COPYONLY) -configure_file(${ARROW_GENERATED_IPC_DIR}/Schema_generated.h ${CMAKE_SOURCE_DIR}/include/cudf/ipc_generated/Schema_generated.h COPYONLY) -configure_file(${ARROW_GENERATED_IPC_DIR}/Tensor_generated.h ${CMAKE_SOURCE_DIR}/include/cudf/ipc_generated/Tensor_generated.h COPYONLY) - -message(STATUS "Arrow installed here: " ${ARROW_ROOT}/install) -set(ARROW_LIBRARY_DIR "${ARROW_ROOT}/install/lib") -set(ARROW_INCLUDE_DIR "${ARROW_ROOT}/install/include") - -find_library(ARROW_LIB arrow - NO_DEFAULT_PATH - HINTS "${ARROW_LIBRARY_DIR}") - -if(ARROW_LIB) - message(STATUS "Arrow library: " ${ARROW_LIB}) - set(ARROW_FOUND TRUE) -endif(ARROW_LIB) - -set(FLATBUFFERS_ROOT "${ARROW_ROOT}/build/flatbuffers_ep-prefix/src/flatbuffers_ep-install") - -message(STATUS "FlatBuffers installed here: " ${FLATBUFFERS_ROOT}) -set(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_ROOT}/include") -set(FLATBUFFERS_LIBRARY_DIR "${FLATBUFFERS_ROOT}/lib") - -add_definitions(-DARROW_METADATA_V4) -add_definitions(-DARROW_VERSION=1210) - - - - - - diff --git a/cpp/cmake/Modules/ConfigureGoogleTest.cmake b/cpp/cmake/Modules/ConfigureGoogleTest.cmake deleted file mode 100644 index 9fac40f4649..00000000000 --- a/cpp/cmake/Modules/ConfigureGoogleTest.cmake +++ /dev/null @@ -1,49 +0,0 @@ -set(GTEST_ROOT "${CMAKE_BINARY_DIR}/googletest") - -set(GTEST_CMAKE_ARGS "") - #" -Dgtest_build_samples=ON" - #" -DCMAKE_VERBOSE_MAKEFILE=ON") - -configure_file("${CMAKE_SOURCE_DIR}/cmake/Templates/GoogleTest.CMakeLists.txt.cmake" - "${GTEST_ROOT}/CMakeLists.txt") - -file(MAKE_DIRECTORY "${GTEST_ROOT}/build") -file(MAKE_DIRECTORY "${GTEST_ROOT}/install") - -execute_process(COMMAND ${CMAKE_COMMAND} -G ${CMAKE_GENERATOR} . - RESULT_VARIABLE GTEST_CONFIG - WORKING_DIRECTORY ${GTEST_ROOT}) - -if(GTEST_CONFIG) - message(FATAL_ERROR "Configuring GoogleTest failed: " ${GTEST_CONFIG}) -endif(GTEST_CONFIG) - -set(PARALLEL_BUILD -j) -if($ENV{PARALLEL_LEVEL}) - set(NUM_JOBS $ENV{PARALLEL_LEVEL}) - set(PARALLEL_BUILD "${PARALLEL_BUILD}${NUM_JOBS}") -endif($ENV{PARALLEL_LEVEL}) - -if(${NUM_JOBS}) - if(${NUM_JOBS} EQUAL 1) - message(STATUS "GTEST BUILD: Enabling Sequential CMake build") - elseif(${NUM_JOBS} GREATER 1) - message(STATUS "GTEST BUILD: Enabling Parallel CMake build with ${NUM_JOBS} jobs") - endif(${NUM_JOBS} EQUAL 1) -else() - message(STATUS "GTEST BUILD: Enabling Parallel CMake build with all threads") -endif(${NUM_JOBS}) - -execute_process(COMMAND ${CMAKE_COMMAND} --build .. 
-- ${PARALLEL_BUILD} - RESULT_VARIABLE GTEST_BUILD - WORKING_DIRECTORY ${GTEST_ROOT}/build) - -if(GTEST_BUILD) - message(FATAL_ERROR "Building GoogleTest failed: " ${GTEST_BUILD}) -endif(GTEST_BUILD) - -message(STATUS "GoogleTest installed here: " ${GTEST_ROOT}/install) -set(GTEST_INCLUDE_DIR "${GTEST_ROOT}/install/include") -set(GTEST_LIBRARY_DIR "${GTEST_ROOT}/install/lib") -set(GTEST_FOUND TRUE) - diff --git a/cpp/cmake/Modules/FindNCCL.cmake b/cpp/cmake/Modules/FindNCCL.cmake deleted file mode 100644 index 0f673707444..00000000000 --- a/cpp/cmake/Modules/FindNCCL.cmake +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Based on FindPNG.cmake from cmake 3.14.3 - -#[=======================================================================[.rst: -FindNCCL --------- - -Find libnccl, the NVIDIA Collective Communication Library. A hint to find NCCL -can be provided by setting NCCL_INSTALL_DIR. - -Imported targets -^^^^^^^^^^^^^^^^ - -This module defines the following :prop_tgt:`IMPORTED` target: - -``NCCL::NCCL`` - The libnccl library, if found. - -Result variables -^^^^^^^^^^^^^^^^ - -This module will set the following variables in your project: - -``NCCL_INCLUDE_DIRS`` - where to find nccl.h , etc. -``NCCL_LIBRARIES`` - the libraries to link against to use NCCL. -``NCCL_FOUND`` - If false, do not try to use NCCL. -``NCCL_VERSION_STRING`` - the version of the NCCL library found - -#]=======================================================================] - -find_path(NCCL_NCCL_INCLUDE_DIR nccl.h HINTS ${NCCL_INSTALL_DIR} PATH_SUFFIXES include) - -#TODO: Does this need to support finding the static library? - -list(APPEND NCCL_NAMES nccl libnccl) -set(_NCCL_VERSION_SUFFIXES 2) - -foreach(v IN LISTS _NCCL_VERSION_SUFFIXES) - list(APPEND NCCL_NAMES nccl${v} libnccl${v}) -endforeach() -unset(_NCCL_VERSION_SUFFIXES) -# For compatibility with versions prior to this multi-config search, honor -# any NCCL_LIBRARY that is already specified and skip the search. -if(NOT NCCL_LIBRARY) - find_library(NCCL_LIBRARY_RELEASE NAMES ${NCCL_NAMES} HINTS ${NCCL_INSTALL_DIR} PATH_SUFFIXES lib) - include(${CMAKE_ROOT}/Modules/SelectLibraryConfigurations.cmake) - select_library_configurations(NCCL) - mark_as_advanced(NCCL_LIBRARY_RELEASE) -endif() -unset(NCCL_NAMES) - -# Set by select_library_configurations(), but we want the one from -# find_package_handle_standard_args() below. 
-unset(NCCL_FOUND) - -if (NCCL_LIBRARY AND NCCL_NCCL_INCLUDE_DIR) - set(NCCL_INCLUDE_DIRS ${NCCL_NCCL_INCLUDE_DIR} ) - set(NCCL_LIBRARY ${NCCL_LIBRARY}) - - if(NOT TARGET NCCL::NCCL) - add_library(NCCL::NCCL UNKNOWN IMPORTED) - set_target_properties(NCCL::NCCL PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIRS}") - if(EXISTS "${NCCL_LIBRARY}") - set_target_properties(NCCL::NCCL PROPERTIES - IMPORTED_LINK_INTERFACE_LANGUAGES "C" - IMPORTED_LOCATION "${NCCL_LIBRARY}") - endif() - endif() -endif () - -if (NCCL_NCCL_INCLUDE_DIR AND EXISTS "${NCCL_NCCL_INCLUDE_DIR}/nccl.h") - file(STRINGS "${NCCL_NCCL_INCLUDE_DIR}/nccl.h" nccl_major_version_str REGEX "^#define[ \t]+NCCL_MAJOR[ \t]+[0-9]+") - string(REGEX REPLACE "^#define[ \t]+NCCL_MAJOR[ \t]+([0-9]+)" "\\1" nccl_major_version_str "${nccl_major_version_str}") - - file(STRINGS "${NCCL_NCCL_INCLUDE_DIR}/nccl.h" nccl_minor_version_str REGEX "^#define[ \t]+NCCL_MINOR[ \t]+[0-9]+") - string(REGEX REPLACE "^#define[ \t]+NCCL_MINOR[ \t]+([0-9]+)" "\\1" nccl_minor_version_str "${nccl_minor_version_str}") - - file(STRINGS "${NCCL_NCCL_INCLUDE_DIR}/nccl.h" nccl_patch_version_str REGEX "^#define[ \t]+NCCL_PATCH[ \t]+[0-9]+") - string(REGEX REPLACE "^#define[ \t]+NCCL_PATCH[ \t]+([0-9]+)" "\\1" nccl_patch_version_str "${nccl_patch_version_str}") - - file(STRINGS "${NCCL_NCCL_INCLUDE_DIR}/nccl.h" nccl_suffix_version_str REGEX "^#define[ \t]+NCCL_SUFFIX[ \t]+\".*\"") - string(REGEX REPLACE "^#define[ \t]+NCCL_SUFFIX[ \t]+\"(.*)\"" "\\1" nccl_suffix_version_str "${nccl_suffix_version_str}") - - set(NCCL_VERSION_STRING "${nccl_major_version_str}.${nccl_minor_version_str}.${nccl_patch_version_str}${nccl_suffix_version_str}") - - unset(nccl_major_version_str) - unset(nccl_minor_version_str) - unset(nccl_patch_version_str) - unset(nccl_suffix_version_str) -endif () - -include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) -find_package_handle_standard_args(NCCL - REQUIRED_VARS NCCL_LIBRARY NCCL_NCCL_INCLUDE_DIR - VERSION_VAR NCCL_VERSION_STRING) - -mark_as_advanced(NCCL_NCCL_INCLUDE_DIR NCCL_LIBRARY) diff --git a/cpp/cmake/Modules/Version.cmake b/cpp/cmake/Modules/Version.cmake new file mode 100644 index 00000000000..15046784175 --- /dev/null +++ b/cpp/cmake/Modules/Version.cmake @@ -0,0 +1,18 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+ +# Generate version_config.hpp from the version found in CMakeLists.txt +function(write_version) + message(STATUS "CUGRAPH VERSION: ${CUGRAPH_VERSION}") + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/version_config.hpp.in + ${CMAKE_CURRENT_BINARY_DIR}/include/cugraph/version_config.hpp @ONLY) +endfunction(write_version) diff --git a/cpp/cmake/Templates/Arrow.CMakeLists.txt.cmake b/cpp/cmake/Templates/Arrow.CMakeLists.txt.cmake deleted file mode 100644 index b1eaf3f0efa..00000000000 --- a/cpp/cmake/Templates/Arrow.CMakeLists.txt.cmake +++ /dev/null @@ -1,19 +0,0 @@ -cmake_minimum_required(VERSION 3.12) - -include(ExternalProject) - -ExternalProject_Add(Arrow - GIT_REPOSITORY https://github.com/apache/arrow.git - GIT_TAG apache-arrow-0.12.1 - SOURCE_DIR "${ARROW_ROOT}/arrow" - SOURCE_SUBDIR "cpp" - BINARY_DIR "${ARROW_ROOT}/build" - INSTALL_DIR "${ARROW_ROOT}/install" - CMAKE_ARGS ${ARROW_CMAKE_ARGS} -DCMAKE_INSTALL_PREFIX=${ARROW_ROOT}/install) - - - - - - - diff --git a/cpp/cmake/Templates/GoogleTest.CMakeLists.txt.cmake b/cpp/cmake/Templates/GoogleTest.CMakeLists.txt.cmake deleted file mode 100644 index 66e1dc85a50..00000000000 --- a/cpp/cmake/Templates/GoogleTest.CMakeLists.txt.cmake +++ /dev/null @@ -1,19 +0,0 @@ -cmake_minimum_required(VERSION 3.12) - -include(ExternalProject) - -ExternalProject_Add(GoogleTest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG release-1.8.0 - SOURCE_DIR "${GTEST_ROOT}/googletest" - BINARY_DIR "${GTEST_ROOT}/build" - INSTALL_DIR "${GTEST_ROOT}/install" - CMAKE_ARGS ${GTEST_CMAKE_ARGS} -DCMAKE_INSTALL_PREFIX=${GTEST_ROOT}/install) - - - - - - - - diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake new file mode 100644 index 00000000000..7b9ab17bef6 --- /dev/null +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -0,0 +1,37 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_and_configure_cuco VERSION) + + if(TARGET cuco::cuco) + return() + endif() + + rapids_cpm_find(cuco ${VERSION} + GLOBAL_TARGETS cuco cuco::cuco + CPM_ARGS + GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git + GIT_TAG b1fea0cbe4c384160740af00f7c8760846539abb + OPTIONS "BUILD_TESTS OFF" + "BUILD_BENCHMARKS OFF" + "BUILD_EXAMPLES OFF" + ) + + add_library(cuco::cuco ALIAS cuco) + +endfunction() + +find_and_configure_cuco(0.0.1) diff --git a/cpp/cmake/thirdparty/get_cuhornet.cmake b/cpp/cmake/thirdparty/get_cuhornet.cmake new file mode 100644 index 00000000000..28c83161ff4 --- /dev/null +++ b/cpp/cmake/thirdparty/get_cuhornet.cmake @@ -0,0 +1,45 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_and_configure_cuhornet) + + # We are not using the cuhornet CMake targets, so no need to call `add_subdirectory()`, + # or to use CPM + FetchContent_Declare( + cuhornet + GIT_REPOSITORY https://github.com/rapidsai/cuhornet.git + GIT_TAG 261399356e62bd76fa7628880f1a847aee713eed + SOURCE_SUBDIR hornet + ) + FetchContent_GetProperties(cuhornet) + + if(NOT cuhornet_POPULATED) + FetchContent_Populate(cuhornet) + endif() + + if(NOT TARGET cugraph::cuHornet) + add_library(cugraph::cuHornet IMPORTED INTERFACE GLOBAL) + target_include_directories(cugraph::cuHornet INTERFACE + "${cuhornet_SOURCE_DIR}/hornet/include" + "${cuhornet_SOURCE_DIR}/hornetsnest/include" + "${cuhornet_SOURCE_DIR}/xlib/include" + "${cuhornet_SOURCE_DIR}/primitives" + ) + endif() +endfunction() + + +find_and_configure_cuhornet() diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake new file mode 100644 index 00000000000..a65401579cb --- /dev/null +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -0,0 +1,53 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_faiss) + set(oneValueArgs VERSION PINNED_TAG) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + rapids_find_generate_module(FAISS + HEADER_NAMES faiss/IndexFlat.h + LIBRARY_NAMES faiss + ) + + rapids_cpm_find(FAISS ${PKG_VERSION} + GLOBAL_TARGETS faiss + CPM_ARGS + GIT_REPOSITORY https://github.com/facebookresearch/faiss.git + GIT_TAG ${PKG_PINNED_TAG} + OPTIONS + "FAISS_ENABLE_PYTHON OFF" + "BUILD_SHARED_LIBS OFF" + "CUDAToolkit_ROOT ${CUDAToolkit_LIBRARY_DIR}" + "FAISS_ENABLE_GPU ON" + "BUILD_TESTING OFF" + "CMAKE_MESSAGE_LOG_LEVEL VERBOSE" + ) + + if(FAISS_ADDED) + set(FAISS_GPU_HEADERS ${FAISS_SOURCE_DIR} PARENT_SCOPE) + endif() + + if(TARGET faiss AND NOT TARGET FAISS::FAISS) + add_library(FAISS::FAISS ALIAS faiss) + endif() + +endfunction() + +find_and_configure_faiss(VERSION 1.7.0 + PINNED_TAG bde7c0027191f29c9dadafe4f6e68ca0ee31fb30 + ) diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake new file mode 100644 index 00000000000..e413cad7601 --- /dev/null +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -0,0 +1,43 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_and_configure_gtest VERSION) + + if(TARGET GTest::gtest) + return() + endif() + + rapids_cpm_find(GTest ${VERSION} + GLOBAL_TARGETS gmock gmock_main gtest gtest_main GTest::gmock GTest::gtest GTest::gtest_main + CPM_ARGS + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG release-${VERSION} + GIT_SHALLOW TRUE + OPTIONS "INSTALL_GTEST ON" + # googletest >= 1.10.0 provides a cmake config file -- use it if it exists + FIND_PACKAGE_ARGUMENTS "CONFIG" + ) + + if(NOT TARGET GTest::gtest) + add_library(GTest::gmock ALIAS gmock) + add_library(GTest::gmock_main ALIAS gmock_main) + add_library(GTest::gtest ALIAS gtest) + add_library(GTest::gtest_main ALIAS gtest_main) + endif() + +endfunction() + +find_and_configure_gtest(1.10.0) diff --git a/cpp/cmake/thirdparty/get_gunrock.cmake b/cpp/cmake/thirdparty/get_gunrock.cmake new file mode 100644 index 00000000000..056cd4bd5ea --- /dev/null +++ b/cpp/cmake/thirdparty/get_gunrock.cmake @@ -0,0 +1,64 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_and_configure_gunrock VERSION) + + if(NOT TARGET gunrock) + set(GUNROCK_GENCODE_SM60 OFF) + set(GUNROCK_GENCODE_SM61 OFF) + set(GUNROCK_GENCODE_SM70 OFF) + set(GUNROCK_GENCODE_SM72 OFF) + set(GUNROCK_GENCODE_SM75 OFF) + set(GUNROCK_GENCODE_SM80 OFF) + + foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + string(REPLACE "-real" "" arch ${arch}) + set(GUNROCK_GENCODE_SM${arch} "ON") + endforeach() + + # FIXME: gunrock is still using ExternalProject instead of CPM, as version 1.2 + # doesn't work with CPM + + include(ExternalProject) + + set(GUNROCK_DIR ${CMAKE_CURRENT_BINARY_DIR}/gunrock) + ExternalProject_Add(gunrock_ext + GIT_REPOSITORY https://github.com/gunrock/gunrock.git + GIT_TAG v${VERSION} + PREFIX ${GUNROCK_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= + -DGUNROCK_BUILD_SHARED_LIBS=OFF + -DGUNROCK_BUILD_TESTS=OFF + -DCUDA_AUTODETECT_GENCODE=OFF + -DGUNROCK_GENCODE_SM60=${GUNROCK_GENCODE_SM60} + -DGUNROCK_GENCODE_SM61=${GUNROCK_GENCODE_SM61} + -DGUNROCK_GENCODE_SM70=${GUNROCK_GENCODE_SM70} + -DGUNROCK_GENCODE_SM72=${GUNROCK_GENCODE_SM72} + -DGUNROCK_GENCODE_SM75=${GUNROCK_GENCODE_SM75} + -DGUNROCK_GENCODE_SM80=${GUNROCK_GENCODE_SM80} + BUILD_BYPRODUCTS ${GUNROCK_DIR}/src/gunrock_ext-build/lib/libgunrock.a + INSTALL_COMMAND "" + ) + + add_library(gunrock STATIC IMPORTED) + add_dependencies(gunrock gunrock_ext) + set_property(TARGET gunrock PROPERTY IMPORTED_LOCATION "${GUNROCK_DIR}/src/gunrock_ext-build/lib/libgunrock.a") + target_include_directories(gunrock INTERFACE "${GUNROCK_DIR}/src/gunrock_ext") + endif() +endfunction() + + +find_and_configure_gunrock(1.2) diff --git a/cpp/cmake/thirdparty/get_nccl.cmake b/cpp/cmake/thirdparty/get_nccl.cmake new file mode 100644 index 00000000000..30ec976f27c --- /dev/null +++ b/cpp/cmake/thirdparty/get_nccl.cmake @@ -0,0 +1,42 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_nccl) + + if(TARGET NCCL::NCCL) + return() + endif() + + set(oneValueArgs VERSION PINNED_TAG) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + rapids_find_generate_module(NCCL + HEADER_NAMES nccl.h + LIBRARY_NAMES nccl + ) + + # Currently NCCL has no CMake build-system so we require + # it built and installed on the machine already + rapids_find_package(NCCL REQUIRED) + +endfunction() + +find_and_configure_nccl() + + + + diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake new file mode 100644 index 00000000000..a819d7158e1 --- /dev/null +++ b/cpp/cmake/thirdparty/get_raft.cmake @@ -0,0 +1,47 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_and_configure_raft) + + set(oneValueArgs VERSION FORK PINNED_TAG) + cmake_parse_arguments(PKG "" "${oneValueArgs}" "" ${ARGN} ) + + rapids_cpm_find(raft ${PKG_VERSION} + GLOBAL_TARGETS raft::raft + BUILD_EXPORT_SET cugraph-exports + INSTALL_EXPORT_SET cugraph-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/${PKG_FORK}/raft.git + GIT_TAG ${PKG_PINNED_TAG} + SOURCE_SUBDIR cpp + OPTIONS "BUILD_TESTS OFF" + ) + + message(VERBOSE "CUGRAPH: Using RAFT located in ${raft_SOURCE_DIR}") + +endfunction() + +set(CUGRAPH_MIN_VERSION_raft "${CUGRAPH_VERSION_MAJOR}.${CUGRAPH_VERSION_MINOR}.00") +set(CUGRAPH_BRANCH_VERSION_raft "${CUGRAPH_VERSION_MAJOR}.${CUGRAPH_VERSION_MINOR}") + + +# Change pinned tag and fork here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# RPM_raft_SOURCE=/path/to/local/raft +find_and_configure_raft(VERSION ${CUGRAPH_MIN_VERSION_raft} + FORK rapidsai + PINNED_TAG branch-${CUGRAPH_BRANCH_VERSION_raft} + ) diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake new file mode 100644 index 00000000000..aecb6489f92 --- /dev/null +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -0,0 +1,47 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_rmm VERSION) + + if(${VERSION} MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(MAJOR_AND_MINOR "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}") + else() + set(MAJOR_AND_MINOR "${VERSION}") + endif() + + if(TARGET rmm::rmm) + return() + endif() + + rapids_cpm_find(rmm ${VERSION} + GLOBAL_TARGETS rmm::rmm + BUILD_EXPORT_SET cugraph-exports + INSTALL_EXPORT_SET cugraph-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/rapidsai/rmm.git + GIT_TAG branch-${MAJOR_AND_MINOR} + GIT_SHALLOW TRUE + OPTIONS "BUILD_TESTS OFF" + "BUILD_BENCHMARKS OFF" + "CUDA_STATIC_RUNTIME ${CUDA_STATIC_RUNTIME}" + "DISABLE_DEPRECATION_WARNING ${DISABLE_DEPRECATION_WARNING}" + ) + +endfunction() + +set(CUGRAPH_MIN_VERSION_rmm "${CUGRAPH_VERSION_MAJOR}.${CUGRAPH_VERSION_MINOR}.00") + +find_and_configure_rmm(${CUGRAPH_MIN_VERSION_rmm}) diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake new file mode 100644 index 00000000000..86fcffed5d2 --- /dev/null +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -0,0 +1,29 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_and_configure_thrust VERSION) + + rapids_cpm_find(Thrust ${VERSION} + CPM_ARGS + GIT_REPOSITORY https://github.com/thrust/thrust.git + GIT_TAG ${VERSION} + ) + + thrust_create_target(cugraph::Thrust FROM_OPTIONS) + +endfunction() + +find_and_configure_thrust(1.12.0) diff --git a/cpp/cmake/version_config.hpp.in b/cpp/cmake/version_config.hpp.in new file mode 100644 index 00000000000..c669d1b97f3 --- /dev/null +++ b/cpp/cmake/version_config.hpp.in @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#define CUGRAPH_VERSION_MAJOR @CUGRAPH_VERSION_MAJOR@ +#define CUGRAPH_VERSION_MINOR @CUGRAPH_VERSION_MINOR@ +#define CUGRAPH_VERSION_PATCH @CUGRAPH_VERSION_PATCH@ diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md new file mode 100644 index 00000000000..b369183a262 --- /dev/null +++ b/cpp/docs/DEVELOPER_GUIDE.md @@ -0,0 +1,281 @@ +# cuGraph C++ Developer Guide + +This document serves as a guide for contributors to cuGraph C++ code. 
Developers should also refer
+to these additional files for further documentation of cuGraph best practices.
+
+* [Documentation Guide](TODO) for guidelines on documenting cuGraph code.
+* [Testing Guide](TODO) for guidelines on writing unit tests.
+* [Benchmarking Guide](TODO) for guidelines on writing unit benchmarks.
+
+# Overview
+
+cuGraph includes a C++ library that provides GPU-accelerated graph algorithms for processing
+sparse graphs.
+
+## Lexicon
+
+This section defines terminology used within cuGraph.
+
+### COO
+
+COOrdinate format is one of the standard formats for representing graph data. In COO format the
+graph is represented as an array of source vertex ids, an array of destination vertex ids, and an
+optional array of edge weights. Edge i is identified by source_vertex_id[i], destination_vertex_id[i]
+and weight[i].
+
+### MORE
+
+# Directory Structure and File Naming
+
+External/public cuGraph APIs are grouped based on functionality into an appropriately titled
+header file in `cugraph/cpp/include/`. For example, `cugraph/cpp/include/graph.hpp`
+contains the definition of the (legacy) graph objects. Note the `.hpp`
+file extension used to indicate a C++ header file.
+
+Header files should use the `#pragma once` include guard.
+
+## File extensions
+
+- `.hpp` : C++ header files
+- `.cpp` : C++ source files
+- `.cu`  : CUDA C++ source files
+- `.cuh` : Headers containing CUDA device code
+
+Header files and source files should use `.hpp` and `.cpp` extensions unless they must
+be compiled by nvcc. `.cu` and `.cuh` files are more expensive to compile, so we want
+to minimize the use of these files to only when necessary. A good indicator of the need
+to use a `.cu` or `.cuh` file is the inclusion of `__device__` and other
+symbols that are only recognized by `nvcc`. Another indicator is the use of Thrust
+algorithm APIs with a device execution policy (always `rmm::exec_policy` in cuGraph).
+
+## Code and Documentation Style and Formatting
+
+cuGraph code uses [snake_case](https://en.wikipedia.org/wiki/Snake_case) for all names except in a
+few cases: unit tests and test case names may use Pascal case, aka
+[UpperCamelCase](https://en.wikipedia.org/wiki/Camel_case). We do not use
+[Hungarian notation](https://en.wikipedia.org/wiki/Hungarian_notation), except for the following examples:
+ * device data variables should be prefaced by d_ if it makes the intent clearer
+ * host data variables should be prefaced by h_ if it makes the intent clearer
+ * template parameters defining a type should be suffixed with _t
+ * private member variables are typically suffixed with an underscore
+
+```c++
+template <typename graph_t>
+void algorithm_function(graph_t const &g)
+{
+  ...
+}
+
+template <typename vertex_t>
+class utility_class
+{
+  ...
+ private:
+  vertex_t num_vertices_{};
+};
+```
+
+C++ formatting is enforced using `clang-format`. You should configure `clang-format` on your
+machine to use the `cugraph/cpp/.clang-format` configuration file, and run `clang-format` on all
+changed code before committing it. The easiest way to do this is to configure your editor to
+"format on save".
+
+Aspects of code style not discussed in this document and not automatically enforceable are typically
+caught during code review, or not enforced.
+
+### C++ Guidelines
+
+In general, we recommend following
+[C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines). We also
+recommend watching Sean Parent's [C++ Seasoning talk](https://www.youtube.com/watch?v=W2tWOdzgXHA),
+and we try to follow his rules: "No raw loops. No raw pointers. No raw synchronization primitives."
+
+ * Prefer algorithms from STL and Thrust to raw loops (see the sketch below).
+ * Prefer cugraph and RMM to raw pointers and raw memory allocation.
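+
+As a minimal illustration of the "no raw loops" rule (a sketch only; `scale_weights` is a
+hypothetical helper, and the device lambda assumes nvcc's extended-lambda flag), a Thrust
+algorithm on an RMM execution policy replaces a hand-written kernel loop:
+
+```c++
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/transform.h>
+
+// Doubles every edge weight in-place. A raw kernel with an explicit index loop
+// would also work, but the algorithm call is shorter and harder to get wrong.
+void scale_weights(rmm::device_uvector<float>& weights, rmm::cuda_stream_view stream)
+{
+  thrust::transform(rmm::exec_policy(stream),
+                    weights.begin(),
+                    weights.end(),
+                    weights.begin(),
+                    [] __device__(float w) { return 2.0f * w; });
+}
+```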
+
+Documentation is discussed in the [Documentation Guide](TODO).
+
+### Includes
+
+The following guidelines apply to organizing `#include` lines.
+
+ * Group includes by library (e.g. cuGraph, RMM, Thrust, STL). `clang-format` will respect the
+   groupings and sort the individual includes within a group lexicographically.
+ * Separate groups by a blank line.
+ * Order the groups from "nearest" to "farthest". In other words, local includes, then includes
+   from other RAPIDS libraries, then includes from related libraries, like `<thrust/...>`, then
+   includes from dependencies installed with cuGraph, and then standard headers (for example
+   `<string>`, `<iostream>`).
+ * Use <> instead of "" unless the header is in the same directory as the source file.
+ * Tools like `clangd` often auto-insert includes when they can, but they usually get the grouping
+   and brackets wrong.
+ * Always check that includes are only necessary for the file in which they are included.
+   Try to avoid excessive including, especially in header files. Double check this when you remove
+   code.
+
+# cuGraph Data Structures
+
+Application data in cuGraph is contained in graph objects, but there are a variety of other
+data structures you will use when developing cuGraph code.
+
+## Views and Ownership
+
+Resource ownership is an essential concept in cuGraph. In short, an "owning" object owns a
+resource (such as device memory). It acquires that resource during construction and releases the
+resource in destruction ([RAII](https://en.cppreference.com/w/cpp/language/raii)). A "non-owning"
+object does not own resources. Any class in cuGraph with the `*_view` suffix is non-owning.
+
+## `rmm::device_memory_resource`
+
+cuGraph allocates all device memory via RMM memory resources (MR). See the
+[RMM documentation](https://github.com/rapidsai/rmm/blob/main/README.md) for details.
+
+## Streams
+
+CUDA streams are not yet exposed in external cuGraph APIs.
+
+We are currently investigating the best technique for exposing this.
+
+### Memory Management
+
+cuGraph code generally eschews raw pointers and direct memory allocation. Use RMM classes built to
+use `device_memory_resource`(*)s for device memory allocation with automated lifetime management.
+
+#### `rmm::device_buffer`
+Allocates a specified number of bytes of untyped, uninitialized device memory using a
+`device_memory_resource`. If no resource is explicitly provided, uses
+`rmm::mr::get_current_device_resource()`.
+
+`rmm::device_buffer` is movable and copyable on a stream. A copy performs a deep copy of the
+`device_buffer`'s device memory on the specified stream, whereas a move moves ownership of the
+device memory from one `device_buffer` to another.
+
+```c++
+// Allocates at least 100 bytes of uninitialized device memory
+// using the specified resource and stream
+rmm::device_buffer buff(100, stream, mr);
+void * raw_data = buff.data(); // Raw pointer to underlying device memory
+
+// Deep copies `buff` into `copy` on `stream`
+rmm::device_buffer copy(buff, stream);
+
+// Moves contents of `buff` into `moved_to`
+rmm::device_buffer moved_to(std::move(buff));
+
+custom_memory_resource *mr...;
+// Allocates 100 bytes from the custom_memory_resource
+rmm::device_buffer custom_buff(100, stream, mr);
+```
+
+#### `rmm::device_uvector<T>`
+
+Similar to a `rmm::device_vector`, allocates a contiguous set of elements in device memory but with
+key differences:
+- As an optimization, elements are uninitialized and no synchronization occurs at construction.
+This limits the types `T` to trivially copyable types.
+- All operations are stream ordered (i.e., they accept a `cuda_stream_view` specifying the stream
+on which the operation is performed).
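+
+For example (a minimal sketch, not part of the guide; `make_iota` is a hypothetical helper),
+allocating a `device_uvector` and filling it with a Thrust algorithm on the same stream:
+
+```c++
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/sequence.h>
+
+#include <cstddef>
+
+rmm::device_uvector<int> make_iota(std::size_t n, rmm::cuda_stream_view stream)
+{
+  // n uninitialized ints; construction does not synchronize the stream
+  rmm::device_uvector<int> vec(n, stream);
+  // fill with 0..n-1, ordered on the same stream the vector was allocated on
+  thrust::sequence(rmm::exec_policy(stream), vec.begin(), vec.end(), 0);
+  return vec;
+}
+```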
+
+## Namespaces
+
+### External
+All public cuGraph APIs should be placed in the `cugraph` namespace. Example:
+```c++
+namespace cugraph{
+   void public_function(...);
+} // namespace cugraph
+```
+
+### Internal
+
+Many functions are not meant for public use, so place them in either the `detail` or an *anonymous*
+namespace, depending on the situation.
+
+#### `detail` namespace
+
+Functions or objects that will be used across *multiple* translation units (i.e., source files),
+should be exposed in an internal header file and placed in the `detail` namespace. Example:
+
+```c++
+// some_utilities.hpp
+namespace cugraph{
+namespace detail{
+void reusable_helper_function(...);
+} // namespace detail
+} // namespace cugraph
+```
+
+#### Anonymous namespace
+
+Functions or objects that will only be used in a *single* translation unit should be defined in an
+*anonymous* namespace in the source file where they are used. Example:
+
+```c++
+// some_file.cpp
+namespace{
+void isolated_helper_function(...);
+} // anonymous namespace
+```
+
+[**Anonymous namespaces should *never* be used in a header file.**](https://wiki.sei.cmu.edu/confluence/display/cplusplus/DCL59-CPP.+Do+not+define+an+unnamed+namespace+in+a+header+file)
+
+# Error Handling
+
+cuGraph follows conventions (and provides utilities) enforcing compile-time and run-time
+conditions and detecting and handling CUDA errors. Communication of errors is always via C++
+exceptions.
+
+## Runtime Conditions
+
+Use the `CUGRAPH_EXPECTS` macro to enforce runtime conditions necessary for correct execution.
+
+Example usage:
+```c++
+CUGRAPH_EXPECTS(lhs.type() == rhs.type(), "Column type mismatch");
+```
+
+The first argument is the conditional expression expected to resolve to `true` under normal
+conditions. If the conditional evaluates to `false`, then an error has occurred and an instance
+of `cugraph::logic_error` is thrown. The second argument to `CUGRAPH_EXPECTS` is a short
+description of the error that has occurred and is used for the exception's `what()` message.
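+
+From the caller's side, a failed check surfaces as an ordinary C++ exception. A minimal sketch
+(`do_graph_operation` is a hypothetical function that fails a `CUGRAPH_EXPECTS` check internally):
+
+```c++
+#include <cugraph/utilities/error.hpp>
+
+#include <iostream>
+
+void do_graph_operation();  // hypothetical; throws cugraph::logic_error on a failed check
+
+void call_site()
+{
+  try {
+    do_graph_operation();
+  } catch (cugraph::logic_error const& e) {
+    std::cerr << "cuGraph precondition failed: " << e.what() << '\n';
+  }
+}
+```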
+
+There are times where a particular code path, if reached, should indicate an error no matter what.
+For example, often the `default` case of a `switch` statement represents an invalid alternative.
+Use the `CUGRAPH_FAIL` macro for such errors. This is effectively the same as calling
+`CUGRAPH_EXPECTS(false, reason)`.
+
+Example:
+```c++
+CUGRAPH_FAIL("This code path should not be reached.");
+```
+
+### CUDA Error Checking
+
+Use the `CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This
+macro throws a `cugraph::cuda_error` exception if the CUDA API return value is not `cudaSuccess`. The
+thrown exception includes a description of the CUDA error code in its `what()` message.
+
+Example:
+
+```c++
+CUDA_TRY( cudaMemcpy(&dst, &src, num_bytes, cudaMemcpyDefault) );
+```
+
+## Compile-Time Conditions
+
+Use `static_assert` to enforce compile-time conditions. For example,
+
+```c++
+template <typename T>
+void trivial_types_only(T t){
+  static_assert(std::is_trivial<T>::value, "This function requires a trivial type.");
+...
+}
+```
+
+# Data Types
+
+TBD
+
+# Type Dispatcher
+
+TBD
diff --git a/cpp/include/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp
similarity index 69%
rename from cpp/include/algorithms.hpp
rename to cpp/include/cugraph/algorithms.hpp
index a57e550521e..2e0d0f055c8 100644
--- a/cpp/include/algorithms.hpp
+++ b/cpp/include/cugraph/algorithms.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,84 +15,16 @@
 */
 #pragma once

-#include
-#include
-#include
-#include
+#include
+#include
+#include

-namespace cugraph {
+#include
+#include

-/**
- * @brief     Find the PageRank vertex values for a graph.
- *
- * cuGraph computes an approximation of the Pagerank eigenvector using the power method.
- * The number of iterations depends on the properties of the network itself; it increases
- * when the tolerance descreases and/or alpha increases toward the limiting value of 1.
- * The user is free to use default values or to provide inputs for the initial guess,
- tolerance and maximum number of iterations.
+#include

- *
- * @throws     cugraph::logic_error with a custom message when an error occurs.
- *
- * @tparam VT     Type of vertex identifiers. Supported value : int (signed, 32-bit)
- * @tparam ET     Type of edge identifiers. Supported value : int (signed, 32-bit)
- * @tparam WT     Type of edge weights. Supported value : float or double.
- *
- * @param[in] handle     Library handle (RAFT). If a communicator is set in the handle, the multi GPU version will be selected.
- * @param[in] graph      cuGraph graph descriptor, should contain the connectivity information as a transposed adjacency list (CSC). Edge weights are not used for this algorithm.
- * @param[in] alpha      The damping factor alpha represents the probability to follow an outgoing edge, standard value is 0.85. Thus, 1.0-alpha is the probability to “teleport” to a random vertex. Alpha should be greater than 0.0 and strictly lower than 1.0.
- *                       The initial guess must not be the vector of 0s. Any value other than 1 or 0 is treated as an invalid value.
- * @param[in] pagerank   Array of size V. Should contain the initial guess if has_guess=true. In this case the initial guess cannot be the vector of 0s. Memory is provided and owned by the caller.
- * @param[in] personalization_subset_size (optional) Supported on single-GPU, on the roadmap for Multi-GPU. The number of vertices for to personalize. Initialized to 0 by default.
- * @param[in] personalization_subset (optional) Supported on single-GPU, on the roadmap for Multi-GPU. Array of size personalization_subset_size containing vertices for running personalized
- pagerank.
Initialized to nullptr by default. Memory is provided and owned by the caller. - * @param[in] personalization_values (optional) Supported on single-GPU, on the roadmap for - Multi-GPU. Array of size personalization_subset_size containing values associated with - personalization_subset vertices. Initialized to nullptr by default. Memory is provided and owned by - the caller. - * @param[in] tolerance Supported on single-GPU. Set the tolerance the approximation, - this parameter should be a small magnitude value. - * The lower the tolerance the better the approximation. If this - value is 0.0f, cuGraph will use the default value which is 1.0E-5. - * Setting too small a tolerance can lead to non-convergence due - to numerical roundoff. Usually values between 0.01 and 0.00001 are acceptable. - * @param[in] max_iter (optional) The maximum number of iterations before an answer is - returned. This can be used to limit the execution time and do an early exit before the solver - reaches the convergence tolerance. - * If this value is lower or equal to 0 cuGraph will use the - default value, which is 500. - * @param[in] has_guess (optional) Supported on single-GPU. This parameter is used to - notify cuGraph if it should use a user-provided initial guess. False means the user does not have a - guess, in this case cuGraph will use a uniform vector set to 1/V. - * If the value is True, cuGraph will read the pagerank parameter - and use this as an initial guess. - * @param[out] *pagerank The PageRank : pagerank[i] is the PageRank of vertex i. Memory - remains provided and owned by the caller. - * - */ -template -void pagerank(raft::handle_t const &handle, - GraphCSCView const &graph, - WT *pagerank, - VT personalization_subset_size = 0, - VT *personalization_subset = nullptr, - WT *personalization_values = nullptr, - double alpha = 0.85, - double tolerance = 1e-5, - int64_t max_iter = 500, - bool has_guess = false); +namespace cugraph { /** * @brief Compute jaccard similarity coefficient for all vertices @@ -113,7 +45,7 @@ void pagerank(raft::handle_t const &handle, * caller */ template -void jaccard(GraphCSRView const &graph, WT const *weights, WT *result); +void jaccard(legacy::GraphCSRView const& graph, WT const* weights, WT* result); /** * @brief Compute jaccard similarity coefficient for selected vertex pairs @@ -137,12 +69,12 @@ void jaccard(GraphCSRView const &graph, WT const *weights, WT *resul * caller */ template -void jaccard_list(GraphCSRView const &graph, - WT const *weights, +void jaccard_list(legacy::GraphCSRView const& graph, + WT const* weights, ET num_pairs, - VT const *first, - VT const *second, - WT *result); + VT const* first, + VT const* second, + WT* result); /** * @brief Compute overlap coefficient for all vertices in the graph @@ -163,7 +95,7 @@ void jaccard_list(GraphCSRView const &graph, * caller */ template -void overlap(GraphCSRView const &graph, WT const *weights, WT *result); +void overlap(legacy::GraphCSRView const& graph, WT const* weights, WT* result); /** * @brief Compute overlap coefficient for select pairs of vertices @@ -187,12 +119,12 @@ void overlap(GraphCSRView const &graph, WT const *weights, WT *resul * caller */ template -void overlap_list(GraphCSRView const &graph, - WT const *weights, +void overlap_list(legacy::GraphCSRView const& graph, + WT const* weights, ET num_pairs, - VT const *first, - VT const *second, - WT *result); + VT const* first, + VT const* second, + WT* result); /** * @@ -210,6 +142,8 @@ void overlap_list(GraphCSRView const &graph, * @tparam 
weight_t Type of edge weights. Supported values : float * or double. * + * @param[in] handle Library handle (RAFT). If a communicator is set in the + * handle, the multi GPU version will be selected. * @param[in] graph cuGraph graph descriptor, should contain the * connectivity information as a COO. Graph is considered undirected. Edge weights are used for this * algorithm and set to 1 by default. @@ -246,11 +180,12 @@ void overlap_list(GraphCSRView const &graph, * */ template -void force_atlas2(GraphCOOView &graph, - float *pos, +void force_atlas2(raft::handle_t const& handle, + legacy::GraphCOOView& graph, + float* pos, const int max_iter = 500, - float *x_start = nullptr, - float *y_start = nullptr, + float* x_start = nullptr, + float* y_start = nullptr, bool outbound_attraction_distribution = true, bool lin_log_mode = false, bool prevent_overlapping = false, @@ -262,7 +197,45 @@ void force_atlas2(GraphCOOView &graph, bool strong_gravity_mode = false, const float gravity = 1.0, bool verbose = false, - internals::GraphBasedDimRedCallback *callback = nullptr); + internals::GraphBasedDimRedCallback* callback = nullptr); + +/** + * @brief Finds an approximate solution to the traveling salesperson problem (TSP). + * cuGraph computes an approximation of the TSP problem using hill climbing + * optimization. + * + * The current implementation does not support a weighted graph. + * + * @throws cugraph::logic_error when an error occurs. + * @param[in] handle Library handle (RAFT). If a communicator is set in the + * handle, the multi GPU version will be selected. + * @param[in] vtx_ptr Device array containing the vertex identifiers used + * to initialize the route. + * @param[in] x_pos Device array containing starting x-axis positions. + * @param[in] y_pos Device array containing starting y-axis positions. + * @param[in] nodes Number of cities. + * @param[in] restarts Number of starts to try. The more restarts, + * the better the solution will be approximated. The number of restarts depends on the problem + * size and should be kept low for instances above 2k cities. + * @param[in] beam_search Specify if the initial solution should use KNN + * for an approximation solution. + * @param[in] k Beam width to use in the search. + * @param[in] nstart Start from a specific position. + * @param[in] verbose Logs configuration and iterative improvement. + * @param[out] route Device array containing the returned route. 
+ * + */ +float traveling_salesperson(raft::handle_t const& handle, + int const* vtx_ptr, + float const* x_pos, + float const* y_pos, + int nodes, + int restarts, + bool beam_search, + int k, + int nstart, + bool verbose, + int* route); /** * @brief Compute betweenness centrality for a graph @@ -302,14 +275,14 @@ void force_atlas2(GraphCOOView &graph, * */ template -void betweenness_centrality(const raft::handle_t &handle, - GraphCSRView const &graph, - result_t *result, +void betweenness_centrality(const raft::handle_t& handle, + legacy::GraphCSRView const& graph, + result_t* result, bool normalized = true, bool endpoints = false, - weight_t const *weight = nullptr, + weight_t const* weight = nullptr, vertex_t k = 0, - vertex_t const *vertices = nullptr); + vertex_t const* vertices = nullptr); /** * @brief Compute edge betweenness centrality for a graph @@ -346,13 +319,13 @@ void betweenness_centrality(const raft::handle_t &handle, * */ template -void edge_betweenness_centrality(const raft::handle_t &handle, - GraphCSRView const &graph, - result_t *result, +void edge_betweenness_centrality(const raft::handle_t& handle, + legacy::GraphCSRView const& graph, + result_t* result, bool normalized = true, - weight_t const *weight = nullptr, + weight_t const* weight = nullptr, vertex_t k = 0, - vertex_t const *vertices = nullptr); + vertex_t const* vertices = nullptr); enum class cugraph_cc_t { CUGRAPH_WEAK = 0, ///> Weakly Connected Components @@ -390,9 +363,9 @@ enum class cugraph_cc_t { * associated with vertex id i. */ template -void connected_components(GraphCSRView const &graph, +void connected_components(legacy::GraphCSRView const& graph, cugraph_cc_t connectivity_type, - VT *labels); + VT* labels); /** * @brief Compute k truss for a graph @@ -419,10 +392,10 @@ void connected_components(GraphCSRView const &graph, * */ template -std::unique_ptr> k_truss_subgraph( - GraphCOOView const &graph, +std::unique_ptr> k_truss_subgraph( + legacy::GraphCOOView const& graph, int k, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Compute the Katz centrality for the nodes of the graph G @@ -457,8 +430,8 @@ std::unique_ptr> k_truss_subgraph( * @param[in] normalized If True normalize the resulting katz centrality values */ template -void katz_centrality(GraphCSRView const &graph, - result_t *result, +void katz_centrality(legacy::GraphCSRView const& graph, + result_t* result, double alpha, int max_iter, double tol, @@ -475,7 +448,7 @@ void katz_centrality(GraphCSRView const &graph, */ /* ----------------------------------------------------------------------------*/ template -void core_number(GraphCSRView const &graph, VT *core_number); +void core_number(legacy::GraphCSRView const& graph, VT* core_number); /** * @brief Compute K Core of the graph G @@ -499,13 +472,13 @@ void core_number(GraphCSRView const &graph, VT *core_number); * @param[out] out_graph Unique pointer to K Core subgraph in COO format */ template -std::unique_ptr> k_core( - GraphCOOView const &graph, +std::unique_ptr> k_core( + legacy::GraphCOOView const& graph, int k, - VT const *vertex_id, - VT const *core_number, + VT const* vertex_id, + VT const* core_number, VT num_vertex_ids, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Find all 2-hop neighbors in the graph @@ -525,7 +498,8 @@ 
std::unique_ptr> k_core(
 * @return Graph in COO format
 */
 template
-std::unique_ptr> get_two_hop_neighbors(GraphCSRView const &graph);
+std::unique_ptr> get_two_hop_neighbors(
+  legacy::GraphCSRView const& graph);
 /**
 * @Synopsis Performs a single source shortest path traversal of a graph starting from a vertex.
@@ -552,13 +526,13 @@ std::unique_ptr> get_two_hop_neighbors(GraphCSRView
-void sssp(GraphCSRView const &graph,
-          WT *distances,
-          VT *predecessors,
+void sssp(legacy::GraphCSRView const& graph,
+          WT* distances,
+          VT* predecessors,
           const VT source_vertex);
 
 // FIXME: Internally distances is of int (signed 32-bit) data type, but current
-// template uses data from VT, ET, WT from he GraphCSR View even if weights
+// template uses data from VT, ET, WT from the legacy::GraphCSR View even if weights
 // are not considered
 /**
 * @Synopsis Performs a breadth first search traversal of a graph starting from a vertex.
@@ -593,11 +567,11 @@ void sssp(GraphCSRView const &graph,
 *
 */
 template
-void bfs(raft::handle_t const &handle,
-         GraphCSRView const &graph,
-         VT *distances,
-         VT *predecessors,
-         double *sp_counters,
+void bfs(raft::handle_t const& handle,
+         legacy::GraphCSRView const& graph,
+         VT* distances,
+         VT* predecessors,
+         double* sp_counters,
          const VT start_vertex,
          bool directed = true,
         bool mg_batch = false);
@@ -622,16 +596,50 @@ void bfs(raft::handle_t const &handle,
 * @param[in] graph cuGRAPH COO graph
 * @param[in] num_workers number of vertices in the worker set
 * @param[in] workers device pointer to an array of worker vertex ids
- * @param[out] assignment device pointer to an array to which the assignment will be
+ * @param[out] assignments device pointer to an array to which the assignment will be
 * written. The array should be num_workers long, and will identify which vertex id (job) is
 * assigned to that worker
 */
 template
-weight_t hungarian(raft::handle_t const &handle,
-                   GraphCOOView const &graph,
+weight_t hungarian(raft::handle_t const& handle,
+                   legacy::GraphCOOView const& graph,
                    vertex_t num_workers,
-                   vertex_t const *workers,
-                   vertex_t *assignment);
+                   vertex_t const* workers,
+                   vertex_t* assignments);
+
+/**
+ * @brief Compute Hungarian algorithm on a weighted bipartite graph
+ *
+ * The Hungarian algorithm computes an assignment of "jobs" to "workers". This function accepts
+ * a weighted graph and a vertex list identifying the "workers". The weights in the weighted
+ * graph identify the cost of assigning a particular job to a worker. The algorithm computes
+ * a minimum cost assignment and returns the cost as well as a vector identifying the assignment.
+ *
+ * @throws cugraph::logic_error when an error occurs.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Supported value : int (signed,
+ * 32-bit)
+ * @tparam edge_t Type of edge identifiers. Supported value : int (signed,
+ * 32-bit)
+ * @tparam weight_t Type of edge weights. Supported values : float or double.
+ *
+ * @param[in] handle Library handle (RAFT). If a communicator is set in the handle,
+ * the multi GPU version will be selected.
+ * @param[in] graph cuGRAPH COO graph
+ * @param[in] num_workers number of vertices in the worker set
+ * @param[in] workers device pointer to an array of worker vertex ids
+ * @param[out] assignments device pointer to an array to which the assignment will be
+ * written. The array should be num_workers long, and will identify which vertex id (job) is
+ * assigned to that worker
+ * @param[in] epsilon parameter to define precision of comparisons
+ * in reducing weights to zero.
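// --- Illustrative usage sketch (not part of this changeset): the two
// hungarian() overloads above differ only in the trailing epsilon argument,
// which sets the comparison precision used when reducing weights to zero.
// The bipartite COO view, worker array, and types are assumptions.
float assignment_example(raft::handle_t const& handle,
                         cugraph::legacy::GraphCOOView<int, int, float> const& bipartite_coo,
                         int num_workers,
                         int const* d_workers,
                         int* d_assignments)  // num_workers entries, on device
{
  // Default comparison precision:
  float cost = cugraph::hungarian(handle, bipartite_coo, num_workers, d_workers, d_assignments);
  // Or with an explicit epsilon:
  // cost = cugraph::hungarian(handle, bipartite_coo, num_workers, d_workers, d_assignments, 1e-6f);
  return cost;
}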
+ */ +template +weight_t hungarian(raft::handle_t const& handle, + legacy::GraphCOOView const& graph, + vertex_t num_workers, + vertex_t const* workers, + vertex_t* assignments, + weight_t epsilon); /** * @brief Louvain implementation @@ -646,7 +654,7 @@ weight_t hungarian(raft::handle_t const &handle, * * @throws cugraph::logic_error when an error occurs. * - * @tparam graph_t Type of graph + * @tparam graph_view_t Type of graph * * @param[in] handle Library handle (RAFT). If a communicator is set in the handle, * @param[in] graph input graph object (CSR) @@ -663,13 +671,74 @@ weight_t hungarian(raft::handle_t const &handle, * 2) modularity of the returned clustering * */ -template -std::pair louvain( - raft::handle_t const &handle, - graph_t const &graph, - typename graph_t::vertex_type *clustering, - size_t max_level = 100, - typename graph_t::weight_type resolution = typename graph_t::weight_type{1}); +template +std::pair louvain( + raft::handle_t const& handle, + graph_view_t const& graph_view, + typename graph_view_t::vertex_type* clustering, + size_t max_level = 100, + typename graph_view_t::weight_type resolution = typename graph_view_t::weight_type{1}); + +/** + * @brief Louvain implementation, returning dendrogram + * + * Compute a clustering of the graph by maximizing modularity + * + * Computed using the Louvain method described in: + * + * VD Blondel, J-L Guillaume, R Lambiotte and E Lefebvre: Fast unfolding of + * community hierarchies in large networks, J Stat Mech P10008 (2008), + * http://arxiv.org/abs/0803.0476 + * + * @throws cugraph::logic_error when an error occurs. + * + * @tparam graph_view_t Type of graph + * + * @param[in] handle Library handle (RAFT) + * @param[in] graph_view Input graph view object (CSR) + * @param[in] max_level (optional) maximum number of levels to run (default 100) + * @param[in] resolution (optional) The value of the resolution parameter to use. + * Called gamma in the modularity formula, this changes the size + * of the communities. Higher resolutions lead to more smaller + * communities, lower resolutions lead to fewer larger + * communities. (default 1) + * + * @return a pair containing: + * 1) unique pointer to dendrogram + * 2) modularity of the returned clustering + * + */ +template +std::pair>, + typename graph_view_t::weight_type> +louvain(raft::handle_t const& handle, + graph_view_t const& graph_view, + size_t max_level = 100, + typename graph_view_t::weight_type resolution = typename graph_view_t::weight_type{1}); + +/** + * @brief Flatten a Dendrogram at a particular level + * + * A Dendrogram represents a hierarchical clustering/partitioning of + * a graph. This function will flatten the hierarchical clustering into + * a label for each vertex representing the final cluster/partition to + * which it is assigned + * + * @throws cugraph::logic_error when an error occurs. + * + * @tparam graph_view_t Type of graph + * + * @param[in] handle Library handle (RAFT). 
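// --- Illustrative usage sketch (not part of this changeset): calling the
// clustering-output louvain() documented here. The view type, sizes, and the
// wrapper function are illustrative assumptions.
template <typename graph_view_t>
void louvain_example(raft::handle_t const& handle, graph_view_t const& graph_view)
{
  using vertex_t = typename graph_view_t::vertex_type;
  rmm::device_uvector<vertex_t> clustering(graph_view.get_number_of_vertices(),
                                           handle.get_stream());
  // Returns the number of levels actually computed and the final modularity.
  auto [num_levels, modularity] = cugraph::louvain(handle, graph_view, clustering.data());
  (void)num_levels;
  (void)modularity;
}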
If a communicator is set in the handle, + * @param[in] graph input graph object + * @param[in] dendrogram input dendrogram object + * @param[out] clustering Pointer to device array where the clustering should be stored + * + */ +template +void flatten_dendrogram(raft::handle_t const& handle, + graph_view_t const& graph_view, + Dendrogram const& dendrogram, + typename graph_view_t::vertex_type* clustering); /** * @brief Leiden implementation @@ -706,9 +775,9 @@ std::pair louvain( * 2) modularity of the returned clustering */ template -std::pair leiden(raft::handle_t const &handle, - GraphCSRView const &graph, - vertex_t *clustering, +std::pair leiden(raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + vertex_t* clustering, size_t max_iter = 100, weight_t resolution = weight_t{1}); @@ -738,11 +807,11 @@ std::pair leiden(raft::handle_t const &handle, * written */ template -void ecg(raft::handle_t const &handle, - GraphCSRView const &graph, +void ecg(raft::handle_t const& handle, + legacy::GraphCSRView const& graph, weight_t min_weight, vertex_t ensemble_size, - vertex_t *clustering); + vertex_t* clustering); /** * @brief Generate edges in a minimum spanning forest of an undirected weighted graph. @@ -765,10 +834,10 @@ void ecg(raft::handle_t const &handle, * @return out_graph Unique pointer to MSF subgraph in COO format */ template -std::unique_ptr> minimum_spanning_tree( - raft::handle_t const &handle, - GraphCSRView const &graph, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr> minimum_spanning_tree( + raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); namespace triangle { /** @@ -787,7 +856,7 @@ namespace triangle { * @return The number of triangles */ template -uint64_t triangle_count(GraphCSRView const &graph); +uint64_t triangle_count(legacy::GraphCSRView const& graph); } // namespace triangle namespace subgraph { @@ -812,9 +881,9 @@ namespace subgraph { * @param[out] result a graph in COO format containing the edges in the subgraph */ template -std::unique_ptr> extract_subgraph_vertex(GraphCOOView const &graph, - VT const *vertices, - VT num_vertices); +std::unique_ptr> extract_subgraph_vertex( + legacy::GraphCOOView const& graph, VT const* vertices, VT num_vertices); +} // namespace subgraph /** * @brief Wrapper function for Nvgraph balanced cut clustering @@ -837,18 +906,17 @@ std::unique_ptr> extract_subgraph_vertex(GraphCOOView -void balancedCutClustering(GraphCSRView const &graph, +void balancedCutClustering(legacy::GraphCSRView const& graph, VT num_clusters, VT num_eigen_vects, WT evs_tolerance, int evs_max_iter, WT kmean_tolerance, int kmean_max_iter, - VT *clustering); + VT* clustering); /** * @brief Wrapper function for Nvgraph spectral modularity maximization algorithm @@ -872,14 +940,14 @@ void balancedCutClustering(GraphCSRView const &graph, * be stored */ template -void spectralModularityMaximization(GraphCSRView const &graph, +void spectralModularityMaximization(legacy::GraphCSRView const& graph, VT n_clusters, VT n_eig_vects, WT evs_tolerance, int evs_max_iter, WT kmean_tolerance, int kmean_max_iter, - VT *clustering); + VT* clustering); /** * @brief Wrapper function for Nvgraph clustering modularity metric @@ -898,10 +966,10 @@ void spectralModularityMaximization(GraphCSRView const &graph, * @param[out] score Pointer to a float in which the result will be written */ template -void 
analyzeClustering_modularity(GraphCSRView const &graph,
+void analyzeClustering_modularity(legacy::GraphCSRView const& graph,
                                   int n_clusters,
-                                  VT const *clustering,
-                                  WT *score);
+                                  VT const* clustering,
+                                  WT* score);
 /**
 * @brief Wrapper function for Nvgraph clustering edge cut metric
@@ -920,10 +988,10 @@ void analyzeClustering_modularity(GraphCSRView const &graph,
 * @param[out] score Pointer to a float in which the result will be written
 */
 template
-void analyzeClustering_edge_cut(GraphCSRView const &graph,
+void analyzeClustering_edge_cut(legacy::GraphCSRView const& graph,
                                 int n_clusters,
-                                VT const *clustering,
-                                WT *score);
+                                VT const* clustering,
+                                WT* score);
 /**
 * @brief Wrapper function for Nvgraph clustering ratio cut metric
@@ -942,10 +1010,10 @@ void analyzeClustering_edge_cut(GraphCSRView const &graph,
 * @param[out] score Pointer to a float in which the result will be written
 */
 template
-void analyzeClustering_ratio_cut(GraphCSRView const &graph,
+void analyzeClustering_ratio_cut(legacy::GraphCSRView const& graph,
                                  int n_clusters,
-                                 VT const *clustering,
-                                 WT *score);
+                                 VT const* clustering,
+                                 WT* score);
 
 } // namespace ext_raft
@@ -978,13 +1046,13 @@ namespace gunrock {
 *
 */
 template
-void hits(GraphCSRView const &graph,
+void hits(legacy::GraphCSRView const& graph,
           int max_iter,
           WT tolerance,
-          WT const *starting_value,
+          WT const* starting_value,
           bool normalized,
-          WT *hubs,
-          WT *authorities);
+          WT* hubs,
+          WT* authorities);
 
 } // namespace gunrock
@@ -1007,16 +1075,48 @@ namespace dense {
 * @param[in] costs pointer to array of costs, stored in row major order
 * @param[in] num_rows number of rows in dense matrix
 * @param[in] num_cols number of cols in dense matrix
- * @param[out] assignment device pointer to an array to which the assignment will be
+ * @param[out] assignments device pointer to an array to which the assignment will be
+ * written. The array should be num_cols long, and will identify
+ * which vertex id (job) is assigned to that worker
+ */
+template
+weight_t hungarian(raft::handle_t const& handle,
+                   weight_t const* costs,
+                   vertex_t num_rows,
+                   vertex_t num_columns,
+                   vertex_t* assignments);
+
+/**
+ * @brief Compute Hungarian algorithm on a weighted bipartite graph
+ *
+ * The Hungarian algorithm computes an assignment of "jobs" to "workers". This function accepts
+ * a weighted graph and a vertex list identifying the "workers". The weights in the weighted
+ * graph identify the cost of assigning a particular job to a worker. The algorithm computes
+ * a minimum cost assignment and returns the cost as well as a vector identifying the assignment.
+ *
+ * @throws cugraph::logic_error when an error occurs.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Supported value : int (signed,
+ * 32-bit)
+ * @tparam weight_t Type of edge weights. Supported values : float or double.
+ *
+ * @param[in] handle Library handle (RAFT). If a communicator is set in the handle,
+ * the multi GPU version will be selected.
+ * @param[in] costs pointer to array of costs, stored in row major order
+ * @param[in] num_rows number of rows in dense matrix
+ * @param[in] num_cols number of cols in dense matrix
+ * @param[out] assignments device pointer to an array to which the assignment will be
+ * written. The array should be num_cols long, and will identify
+ * which vertex id (job) is assigned to that worker
+ * @param[in] epsilon parameter to define precision of comparisons
+ * in reducing weights to zero.
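// --- Illustrative usage sketch (not part of this changeset): the dense
// hungarian() flavor documented above takes a row-major cost matrix instead
// of a graph. Names and sizes are assumptions; the output array length
// follows the doc above (num_cols entries).
float dense_assignment_example(raft::handle_t const& handle,
                               float const* d_costs,  // num_rows x num_cols, row major, on device
                               int num_rows,
                               int num_cols,
                               int* d_assignments)
{
  return cugraph::dense::hungarian(handle, d_costs, num_rows, num_cols, d_assignments);
}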
*/ template -weight_t hungarian(raft::handle_t const &handle, - weight_t const *costs, +weight_t hungarian(raft::handle_t const& handle, + weight_t const* costs, vertex_t num_rows, vertex_t num_columns, - vertex_t *assignment); + vertex_t* assignments, + weight_t epsilon); } // namespace dense @@ -1035,6 +1135,8 @@ namespace experimental { * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Graph view object. @@ -1051,10 +1153,10 @@ namespace experimental { * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). */ template -void bfs(raft::handle_t const &handle, - graph_view_t const &graph_view, - vertex_t *distances, - vertex_t *predecessors, +void bfs(raft::handle_t const& handle, + graph_view_t const& graph_view, + vertex_t* distances, + vertex_t* predecessors, vertex_t source_vertex, bool direction_optimizing = false, vertex_t depth_limit = std::numeric_limits::max(), @@ -1073,6 +1175,8 @@ void bfs(raft::handle_t const &handle, * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Graph view object. @@ -1084,10 +1188,10 @@ void bfs(raft::handle_t const &handle, * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). */ template -void sssp(raft::handle_t const &handle, - graph_view_t const &graph_view, - weight_t *distances, - vertex_t *predecessors, +void sssp(raft::handle_t const& handle, + graph_view_t const& graph_view, + weight_t* distances, + vertex_t* predecessors, vertex_t source_vertex, weight_t cutoff = std::numeric_limits::max(), bool do_expensive_check = false); @@ -1105,20 +1209,22 @@ void sssp(raft::handle_t const &handle, * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam result_t Type of PageRank scores. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Graph view object. - * @param adj_matrix_row_out_weight_sums Pointer to an array storing sums of out-going edge weights - * for the vertices in the rows of the graph adjacency matrix (for re-use) or `nullptr`. If - * `nullptr`, these values are freshly computed. 
Computing these values outsid this function reduces - * the number of memoray allocations/deallocations and computing if a user repeatedly computes - * PageRank scores using the same graph with different personalization vectors. + * @param precomputed_vertex_out_weight_sums Pointer to an array storing sums of out-going edge + * weights for the vertices (for re-use) or `std::nullopt`. If `std::nullopt`, these values are + * freshly computed. Computing these values outside this function reduces the number of memory + * allocations/deallocations and computing if a user repeatedly computes PageRank scores using the + * same graph with different personalization vectors. * @param personalization_vertices Pointer to an array storing personalization vertex identifiers - * (compute personalized PageRank) or `nullptr` (compute general PageRank). + * (compute personalized PageRank) or `std::nullopt` (compute general PageRank). * @param personalization_values Pointer to an array storing personalization values for the vertices - * in the personalization set. Relevant only if @p personalization_vertices is not `nullptr`. + * in the personalization set. Relevant only if @p personalization_vertices is not `std::nullopt`. * @param personalization_vector_size Size of the personalization set. If @personalization_vertices - * is not `nullptr`, the sizes of the arrays pointed by @p personalization_vertices and @p + * is not `std::nullopt`, the sizes of the arrays pointed by @p personalization_vertices and @p * personalization_values should be @p personalization_vector_size. * @param pageranks Pointer to the output PageRank score array. * @param alpha PageRank damping factor. @@ -1132,13 +1238,13 @@ void sssp(raft::handle_t const &handle, * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). */ template -void pagerank(raft::handle_t const &handle, - graph_view_t const &graph_view, - weight_t *adj_matrix_row_out_weight_sums, - vertex_t *personalization_vertices, - result_t *personalization_values, - vertex_t personalization_vector_size, - result_t *pageranks, +void pagerank(raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional precomputed_vertex_out_weight_sums, + std::optional personalization_vertices, + std::optional personalization_values, + std::optional personalization_vector_size, + result_t* pageranks, result_t alpha, result_t epsilon, size_t max_iterations = 500, @@ -1157,6 +1263,8 @@ void pagerank(raft::handle_t const &handle, * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam result_t Type of Katz Centrality scores. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Graph view object. @@ -1180,10 +1288,10 @@ void pagerank(raft::handle_t const &handle, * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). 
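// --- Illustrative usage sketch (not part of this changeset): with the
// std::optional parameters introduced for pagerank() above, passing
// std::nullopt for the personalization triplet computes plain (general)
// PageRank. The view type alias and all constants are assumptions.
using pr_view_t = cugraph::experimental::graph_view_t<int32_t, int32_t, float, true, false>;
void pagerank_example(raft::handle_t const& handle, pr_view_t const& graph_view, float* d_pageranks)
{
  cugraph::experimental::pagerank(handle,
                                  graph_view,
                                  std::optional<float const*>{std::nullopt},    // precomputed out-weight sums
                                  std::optional<int32_t const*>{std::nullopt},  // personalization vertices
                                  std::optional<float const*>{std::nullopt},    // personalization values
                                  std::optional<int32_t>{std::nullopt},         // personalization size
                                  d_pageranks,
                                  float{0.85},   // alpha (damping factor)
                                  float{1e-6});  // epsilon (convergence tolerance)
}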
*/
 template
-void katz_centrality(raft::handle_t const &handle,
-                     graph_view_t const &graph_view,
-                     result_t *betas,
-                     result_t *katz_centralities,
+void katz_centrality(raft::handle_t const& handle,
+                     graph_view_t const& graph_view,
+                     result_t const* betas,
+                     result_t* katz_centralities,
                      result_t alpha,
                      result_t beta,
                      result_t epsilon,
@@ -1191,6 +1299,96 @@ void katz_centrality(raft::handle_t const &handle,
                      bool has_initial_guess = false,
                      bool normalize = false,
                      bool do_expensive_check = false);
+/**
+ * @brief returns induced EgoNet subgraph(s) of neighbors centered at nodes in source_vertex within
+ * a given radius.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weights. Needs to be a floating point type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms. Must have at least one worker stream.
+ * @param graph_view Graph view object; we extract induced egonet subgraphs from @p graph_view.
+ * @param source_vertex Pointer to egonet center vertices (size == @p n_subgraphs).
+ * @param n_subgraphs Number of induced EgoNet subgraphs to extract (i.e., number of elements in @p
+ * source_vertex).
+ * @param radius Include all neighbors of distance <= radius from @p source_vertex.
+ * @return std::tuple, rmm::device_uvector,
+ * rmm::device_uvector, rmm::device_uvector> Quadruplet of edge source vertices,
+ * edge destination vertices, edge weights, and edge offsets for each induced EgoNet subgraph.
+ */
+template
+std::tuple,
+           rmm::device_uvector,
+           std::optional>,
+           rmm::device_uvector>
+extract_ego(raft::handle_t const& handle,
+            graph_view_t const& graph_view,
+            vertex_t* source_vertex,
+            vertex_t n_subgraphs,
+            vertex_t radius);
+
+/**
+ * @brief returns random walks (RW) from starting sources, where each path is of given maximum
+ * length. Uniform distribution is assumed for the random engine.
+ *
+ * @tparam graph_t Type of graph/view (typically, graph_view_t).
+ * @tparam index_t Type used to store indexing and sizes.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph Graph (view) object to generate RW on.
+ * @param ptr_d_start Device pointer to set of starting vertex indices for the RW.
+ * @param num_paths = number(paths).
+ * @param max_depth maximum length of RWs.
+ * @param use_padding (optional) specifies if return uses padded format (true), or coalesced
+ * (compressed) format; when padding is used the output is a matrix of vertex paths and a matrix of
+ * edges paths (weights); in this case the matrices are stored in row major order; the vertex path
+ * matrix is padded with `num_vertices` values and the weight matrix is padded with `0` values;
+ * @return std::tuple, rmm::device_uvector,
+ * rmm::device_uvector> Triplet of either padded or coalesced RW paths; in the coalesced
+ * case (default), the return consists of corresponding vertex and edge weights for each, and
+ * corresponding path sizes. This is meant to minimize the number of DF's to be passed to the Python
+ * layer.
+ * The meaning of "coalesced" here is that a 2D array of paths of different sizes is
+ * represented as a 1D contiguous array. In the padded case the return is a matrix of num_paths x
+ * max_depth vertex paths; and num_paths x (max_depth-1) edge (weight) paths, with an empty array of
+ * sizes. Note: if the graph is un-weighted the edge (weight) paths consist of `weight_t{1}`
+ * entries;
+ */
+template
+std::tuple,
+           rmm::device_uvector,
+           rmm::device_uvector>
+random_walks(raft::handle_t const& handle,
+             graph_t const& graph,
+             typename graph_t::vertex_type const* ptr_d_start,
+             index_t num_paths,
+             index_t max_depth,
+             bool use_padding = false);
+
+/**
+ * @brief Finds (weakly-connected-)component IDs of each vertex in the input graph.
+ *
+ * The input graph must be symmetric. Component IDs can be arbitrary integers (they can be
+ * non-consecutive and are not ordered by component size or any other criterion).
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weights. Needs to be a floating point type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Graph view object.
+ * @param components Pointer to the output component ID array.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template
+void weakly_connected_components(
+  raft::handle_t const& handle,
+  graph_view_t const& graph_view,
+  vertex_t* components,
+  bool do_expensive_check = false);
 } // namespace experimental
 } // namespace cugraph
diff --git a/cpp/include/compute_partition.cuh b/cpp/include/cugraph/compute_partition.cuh
similarity index 79%
rename from cpp/include/compute_partition.cuh
rename to cpp/include/cugraph/compute_partition.cuh
index c81a6237b31..b8ad0fc19ab 100644
--- a/cpp/include/compute_partition.cuh
+++ b/cpp/include/cugraph/compute_partition.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
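// --- Illustrative usage sketch (not part of this changeset): calling the
// weakly_connected_components() declared above. One component id is written
// per vertex; the input graph must be symmetric. The view type alias is an
// assumption.
using wcc_view_t = cugraph::experimental::graph_view_t<int32_t, int32_t, float, false, false>;
void wcc_example(raft::handle_t const& handle, wcc_view_t const& graph_view)
{
  rmm::device_uvector<int32_t> components(graph_view.get_number_of_vertices(),
                                          handle.get_stream());
  cugraph::experimental::weakly_connected_components(handle, graph_view, components.data());
}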
@@ -17,7 +17,7 @@ #include -#include +#include #include @@ -39,27 +39,32 @@ class compute_partition_t { using graph_view_t = graph_view_type; using vertex_t = typename graph_view_type::vertex_type; - compute_partition_t(graph_view_t const &graph_view) + compute_partition_t(raft::handle_t const& handle, graph_view_t const& graph_view) + : vertex_partition_offsets_v_(0, handle.get_stream()) { - init(graph_view); + init(handle, graph_view); } private: - template * = nullptr> - void init(graph_view_t const &graph_view) + template * = nullptr> + void init(raft::handle_t const& handle, graph_view_t const& graph_view) { } - template * = nullptr> - void init(graph_view_t const &graph_view) + template * = nullptr> + void init(raft::handle_t const& handle, graph_view_t const& graph_view) { auto partition = graph_view.get_partition(); row_size_ = partition.get_row_size(); col_size_ = partition.get_col_size(); size_ = row_size_ * col_size_; - vertex_partition_offsets_v_.resize(size_ + 1); - vertex_partition_offsets_v_ = partition.get_vertex_partition_offsets(); + vertex_partition_offsets_v_.resize(size_ + 1, handle.get_stream()); + auto vertex_partition_offsets = partition.get_vertex_partition_offsets(); + raft::update_device(vertex_partition_offsets_v_.data(), + vertex_partition_offsets.data(), + vertex_partition_offsets.size(), + handle.get_stream()); } public: @@ -76,7 +81,7 @@ class compute_partition_t { */ class vertex_device_view_t { public: - vertex_device_view_t(vertex_t const *d_vertex_partition_offsets, int size) + vertex_device_view_t(vertex_t const* d_vertex_partition_offsets, int size) : d_vertex_partition_offsets_(d_vertex_partition_offsets), size_(size) { } @@ -101,13 +106,13 @@ class compute_partition_t { } private: - vertex_t const *d_vertex_partition_offsets_; + vertex_t const* d_vertex_partition_offsets_; int size_; }; class edge_device_view_t { public: - edge_device_view_t(vertex_t const *d_vertex_partition_offsets, + edge_device_view_t(vertex_t const* d_vertex_partition_offsets, int row_size, int col_size, int size) @@ -153,7 +158,7 @@ class compute_partition_t { } private: - vertex_t const *d_vertex_partition_offsets_; + vertex_t const* d_vertex_partition_offsets_; int row_size_; int col_size_; int size_; @@ -166,7 +171,7 @@ class compute_partition_t { */ vertex_device_view_t vertex_device_view() const { - return vertex_device_view_t(vertex_partition_offsets_v_.data().get(), size_); + return vertex_device_view_t(vertex_partition_offsets_v_.data(), size_); } /** @@ -176,12 +181,11 @@ class compute_partition_t { */ edge_device_view_t edge_device_view() const { - return edge_device_view_t( - vertex_partition_offsets_v_.data().get(), row_size_, col_size_, size_); + return edge_device_view_t(vertex_partition_offsets_v_.data(), row_size_, col_size_, size_); } private: - rmm::device_vector vertex_partition_offsets_v_{}; + rmm::device_uvector vertex_partition_offsets_v_; int row_size_{1}; int col_size_{1}; int size_{1}; diff --git a/cpp/include/cugraph/dendrogram.hpp b/cpp/include/cugraph/dendrogram.hpp new file mode 100644 index 00000000000..beebec4fd3f --- /dev/null +++ b/cpp/include/cugraph/dendrogram.hpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include + +namespace cugraph { + +template +class Dendrogram { + public: + void add_level(vertex_t first_index, + vertex_t num_verts, + rmm::cuda_stream_view stream_view, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + { + level_ptr_.push_back( + std::make_unique>(num_verts, stream_view, mr)); + level_first_index_.push_back(first_index); + } + + size_t current_level() const { return level_ptr_.size() - 1; } + + size_t num_levels() const { return level_ptr_.size(); } + + vertex_t const* get_level_ptr_nocheck(size_t level) const { return level_ptr_[level]->data(); } + + vertex_t* get_level_ptr_nocheck(size_t level) { return level_ptr_[level]->data(); } + + size_t get_level_size_nocheck(size_t level) const { return level_ptr_[level]->size(); } + + vertex_t get_level_first_index_nocheck(size_t level) const { return level_first_index_[level]; } + + vertex_t const* current_level_begin() const { return get_level_ptr_nocheck(current_level()); } + + vertex_t const* current_level_end() const { return current_level_begin() + current_level_size(); } + + vertex_t* current_level_begin() { return get_level_ptr_nocheck(current_level()); } + + vertex_t* current_level_end() { return current_level_begin() + current_level_size(); } + + size_t current_level_size() const { return get_level_size_nocheck(current_level()); } + + vertex_t current_level_first_index() const + { + return get_level_first_index_nocheck(current_level()); + } + + private: + std::vector level_first_index_; + std::vector>> level_ptr_; +}; + +} // namespace cugraph diff --git a/cpp/include/cugraph/detail/shuffle_wrappers.hpp b/cpp/include/cugraph/detail/shuffle_wrappers.hpp new file mode 100644 index 00000000000..fcfd98db447 --- /dev/null +++ b/cpp/include/cugraph/detail/shuffle_wrappers.hpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cugraph { +namespace detail { + +/** + * @brief Shuffle edgelist using the edge key function + * + * NOTE: d_edgelist_rows, d_edgelist_cols and d_edgelist_weights + * are modified within this function (data is sorted) + * But the actual output is returned. The exact contents + * of d_edgelist_rows, d_edgelist_cols and d_edgelist_weights + * after the function is undefined. 
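// --- Illustrative usage sketch (not part of this changeset): building a
// two-level Dendrogram with the class above and reading back the most recent
// level. The sizes and the two-level shape are illustrative only.
void dendrogram_example(rmm::cuda_stream_view stream)
{
  cugraph::Dendrogram<int32_t> dendrogram;
  dendrogram.add_level(0, 1000, stream);  // level 0: one entry per input vertex
  dendrogram.add_level(0, 50, stream);    // level 1: one entry per level-0 cluster
  int32_t* d_current = dendrogram.current_level_begin();  // device pointer, 50 entries
  size_t n_levels    = dendrogram.num_levels();           // == 2
  (void)d_current;
  (void)n_levels;
}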
+ * + * @tparam vertex_t vertex type + * @tparam weight_t weight type + * + * @param[in] handle raft handle + * @param[in/out] d_edgelist_rows vertex ids for row + * @param[in/out] d_edgelist_cols vertex ids for column + * @param[in/out] d_edgelist_weights optional edge weights + * @param[in] store_transposed true if operating on + * transposed matrix + * + * @return tuple of shuffled rows, columns and optional weights + */ +template +std::tuple, + rmm::device_uvector, + std::optional>> +shuffle_edgelist_by_edge(raft::handle_t const& handle, + rmm::device_uvector& d_edgelist_rows, + rmm::device_uvector& d_edgelist_cols, + std::optional>& d_edgelist_weights, + bool store_transposed); + +/** + * @brief Shuffle vertices using the vertex key function + * + * NOTE: d_value is modified within this function + * (data is sorted). But the actual output is returned. + * The exact contents of d_value after the function is + * undefined. + * + * @tparam vertex_t vertex type + * + * @param[in] handle raft handle + * @param[in/out] d_vertices vertex ids to shuffle + * + * @return device vector of shuffled vertices + */ +template +rmm::device_uvector shuffle_vertices(raft::handle_t const& handle, + rmm::device_uvector& d_vertices); + +/** + * @brief Groupby and count edgelist using the edge key function + * + * NOTE: d_edgelist_rows, d_edgelist_cols and d_edgelist_weights + * are modified within this function (data is sorted) + * But the actual output is returned. The exact contents + * of d_edgelist_rows, d_edgelist_cols and d_edgelist_weights + * after the function is undefined. + * + * @tparam vertex_t vertex type + * @tparam weight_t weight type + * + * @param[in] handle raft handle + * @param[in/out] d_edgelist_rows vertex ids for row + * @param[in/out] d_edgelist_cols vertex ids for column + * @param[in/out] d_edgelist_weights optional edge weights + * + * @return tuple of shuffled rows, columns and optional weights + */ +template +rmm::device_uvector groupby_and_count_by_edge( + raft::handle_t const& handle, + rmm::device_uvector& d_edgelist_rows, + rmm::device_uvector& d_edgelist_cols, + std::optional>& d_edgelist_weights, + size_t number_of_local_adj_matrix_partitions); + +} // namespace detail +} // namespace cugraph diff --git a/cpp/include/cugraph/detail/utility_wrappers.hpp b/cpp/include/cugraph/detail/utility_wrappers.hpp new file mode 100644 index 00000000000..580ca00250a --- /dev/null +++ b/cpp/include/cugraph/detail/utility_wrappers.hpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cugraph { +namespace detail { + +/** + * @brief Fill a buffer with uniformly distributed random values + * + * Fills a buffer with uniformly distributed random values between + * the specified minimum and maximum values. 
+ * + * @tparam value_t type of the value to operate on + * + * @param[in] stream_view stream view + * @param[out] d_value device array to fill + * @param[in] size number of elements in array + * @param[in] min_value minimum value + * @param[in] max_value maximum value + * @param[in] seed seed for initializing random number generator + * + */ +template +void uniform_random_fill(rmm::cuda_stream_view const& stream_view, + value_t* d_value, + size_t size, + value_t min_value, + value_t max_value, + uint64_t seed); + +/** + * @brief Fill a buffer with a sequence of values + * + * Fills the buffer with the sequence: + * {start_value, start_value+1, start_value+2, ..., start_value+size-1} + * + * Similar to the function std::iota, wraps the function thrust::sequence + * + * @tparam value_t type of the value to operate on + * + * @param[in] stream_view stream view + * @param[out] d_value device array to fill + * @param[in] size number of elements in array + * @param[in] start_value starting value for sequence + * + */ +template +void sequence_fill(rmm::cuda_stream_view const& stream_view, + value_t* d_value, + size_t size, + value_t start_value); + +/** + * @brief Compute the maximum vertex id of an edge list + * + * max(d_edgelist_rows.max(), d_edgelist_cols.max()) + * + * @tparam vertex_t vertex type + * + * @param[in] stream_view stream view + * @param[in] d_edgelist_rows device array to fill + * @param[in] d_edgelist_cols number of elements in array + * + * @param the maximum value occurring in the edge list + */ +template +vertex_t compute_maximum_vertex_id(rmm::cuda_stream_view const& stream_view, + rmm::device_uvector const& d_edgelist_rows, + rmm::device_uvector const& d_edgelist_cols); + +} // namespace detail +} // namespace cugraph diff --git a/cpp/include/cugraph/experimental/detail/graph_utils.cuh b/cpp/include/cugraph/experimental/detail/graph_utils.cuh new file mode 100644 index 00000000000..02da9a80854 --- /dev/null +++ b/cpp/include/cugraph/experimental/detail/graph_utils.cuh @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
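// --- Illustrative usage sketch (not part of this changeset): typical use of
// the detail utility wrappers declared above, e.g. fabricating a random edge
// list for a test. Sizes and seeds are illustrative assumptions.
void utility_wrappers_example(rmm::cuda_stream_view stream)
{
  size_t num_edges{1000};
  int32_t num_vertices{100};
  rmm::device_uvector<int32_t> rows(num_edges, stream);
  rmm::device_uvector<int32_t> cols(num_edges, stream);
  cugraph::detail::uniform_random_fill(stream, rows.data(), rows.size(),
                                       int32_t{0}, num_vertices - 1, uint64_t{42});
  cugraph::detail::uniform_random_fill(stream, cols.data(), cols.size(),
                                       int32_t{0}, num_vertices - 1, uint64_t{43});
  // Largest vertex id actually present in the edge list:
  int32_t max_id = cugraph::detail::compute_maximum_vertex_id(stream, rows, cols);
  (void)max_id;
}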
+ */ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace cugraph { +namespace experimental { +namespace detail { + +// compute the numbers of nonzeros in rows (of the graph adjacency matrix, if store_transposed = +// false) or columns (of the graph adjacency matrix, if store_transposed = true) +template +rmm::device_uvector compute_major_degrees( + raft::handle_t const& handle, + std::vector const& adj_matrix_partition_offsets, + std::optional> const& adj_matrix_partition_dcs_nzd_vertices, + std::optional> const& adj_matrix_partition_dcs_nzd_vertex_counts, + partition_t const& partition, + std::optional> const& adj_matrix_partition_segment_offsets) +{ + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + auto use_dcs = adj_matrix_partition_dcs_nzd_vertices.has_value(); + + rmm::device_uvector local_degrees(0, handle.get_stream()); + rmm::device_uvector degrees(0, handle.get_stream()); + + vertex_t max_num_local_degrees{0}; + for (int i = 0; i < col_comm_size; ++i) { + auto vertex_partition_idx = static_cast(i * row_comm_size + row_comm_rank); + auto vertex_partition_size = partition.get_vertex_partition_size(vertex_partition_idx); + max_num_local_degrees = std::max(max_num_local_degrees, vertex_partition_size); + if (i == col_comm_rank) { degrees.resize(vertex_partition_size, handle.get_stream()); } + } + local_degrees.resize(max_num_local_degrees, handle.get_stream()); + for (int i = 0; i < col_comm_size; ++i) { + auto vertex_partition_idx = static_cast(i * row_comm_size + row_comm_rank); + vertex_t major_first{}; + vertex_t major_last{}; + std::tie(major_first, major_last) = partition.get_vertex_partition_range(vertex_partition_idx); + auto p_offsets = adj_matrix_partition_offsets[i]; + auto major_hypersparse_first = + use_dcs ? 
major_first + (*adj_matrix_partition_segment_offsets) + [(detail::num_sparse_segments_per_vertex_partition + 2) * i + + detail::num_sparse_segments_per_vertex_partition] + : major_last; + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(major_hypersparse_first - major_first), + local_degrees.begin(), + [p_offsets] __device__(auto i) { return p_offsets[i + 1] - p_offsets[i]; }); + if (use_dcs) { + auto p_dcs_nzd_vertices = (*adj_matrix_partition_dcs_nzd_vertices)[i]; + auto dcs_nzd_vertex_count = (*adj_matrix_partition_dcs_nzd_vertex_counts)[i]; + thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + local_degrees.begin() + (major_hypersparse_first - major_first), + local_degrees.begin() + (major_last - major_first), + edge_t{0}); + thrust::for_each(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(dcs_nzd_vertex_count), + [p_offsets, + p_dcs_nzd_vertices, + major_first, + major_hypersparse_first, + local_degrees = local_degrees.data()] __device__(auto i) { + auto d = p_offsets[(major_hypersparse_first - major_first) + i + 1] - + p_offsets[(major_hypersparse_first - major_first) + i]; + auto v = p_dcs_nzd_vertices[i]; + local_degrees[v - major_first] = d; + }); + } + col_comm.reduce(local_degrees.data(), + i == col_comm_rank ? degrees.data() : static_cast(nullptr), + static_cast(major_last - major_first), + raft::comms::op_t::SUM, + i, + handle.get_stream()); + } + + return degrees; +} + +// compute the numbers of nonzeros in rows (of the graph adjacency matrix, if store_transposed = +// false) or columns (of the graph adjacency matrix, if store_transposed = true) +template +rmm::device_uvector compute_major_degrees(raft::handle_t const& handle, + edge_t const* offsets, + vertex_t number_of_vertices) +{ + rmm::device_uvector degrees(number_of_vertices, handle.get_stream()); + thrust::tabulate(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + degrees.begin(), + degrees.end(), + [offsets] __device__(auto i) { return offsets[i + 1] - offsets[i]; }); + return degrees; +} + +template +struct compute_gpu_id_from_vertex_t { + int comm_size{0}; + + __device__ int operator()(vertex_t v) const + { + cuco::detail::MurmurHash3_32 hash_func{}; + return hash_func(v) % comm_size; + } +}; + +template +struct compute_gpu_id_from_edge_t { + int comm_size{0}; + int row_comm_size{0}; + int col_comm_size{0}; + + __device__ int operator()(vertex_t major, vertex_t minor) const + { + cuco::detail::MurmurHash3_32 hash_func{}; + auto major_comm_rank = static_cast(hash_func(major) % comm_size); + auto minor_comm_rank = static_cast(hash_func(minor) % comm_size); + return (minor_comm_rank / row_comm_size) * row_comm_size + (major_comm_rank % row_comm_size); + } +}; + +template +struct compute_partition_id_from_edge_t { + int comm_size{0}; + int row_comm_size{0}; + int col_comm_size{0}; + + __device__ int operator()(vertex_t major, vertex_t minor) const + { + cuco::detail::MurmurHash3_32 hash_func{}; + auto major_comm_rank = static_cast(hash_func(major) % comm_size); + auto minor_comm_rank = static_cast(hash_func(minor) % comm_size); + return major_comm_rank * col_comm_size + minor_comm_rank / row_comm_size; + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/experimental/eidecl_graph.hpp 
b/cpp/include/cugraph/experimental/eidecl_graph.hpp similarity index 99% rename from cpp/include/experimental/eidecl_graph.hpp rename to cpp/include/cugraph/experimental/eidecl_graph.hpp index b8ac201008a..18e617c0993 100644 --- a/cpp/include/experimental/eidecl_graph.hpp +++ b/cpp/include/cugraph/experimental/eidecl_graph.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/experimental/eidir_graph.hpp b/cpp/include/cugraph/experimental/eidir_graph.hpp similarity index 98% rename from cpp/include/experimental/eidir_graph.hpp rename to cpp/include/cugraph/experimental/eidir_graph.hpp index 8998943ec16..93aa333dc5b 100644 --- a/cpp/include/experimental/eidir_graph.hpp +++ b/cpp/include/cugraph/experimental/eidir_graph.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/experimental/graph.hpp b/cpp/include/cugraph/experimental/graph.hpp similarity index 50% rename from cpp/include/experimental/graph.hpp rename to cpp/include/cugraph/experimental/graph.hpp index 592294c8967..9b0849c704f 100644 --- a/cpp/include/experimental/graph.hpp +++ b/cpp/include/cugraph/experimental/graph.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,13 +15,14 @@ */ #pragma once -#include -#include +#include +#include #include #include #include +#include #include #include #include @@ -31,9 +32,9 @@ namespace experimental { template struct edgelist_t { - vertex_t const *p_src_vertices{nullptr}; - vertex_t const *p_dst_vertices{nullptr}; - weight_t const *p_edge_weights{nullptr}; + vertex_t const* p_src_vertices{nullptr}; + vertex_t const* p_dst_vertices{nullptr}; + std::optional p_edge_weights{std::nullopt}; edge_t number_of_edges{0}; }; @@ -61,24 +62,44 @@ class graph_t> const &edge_lists, - partition_t const &partition, + graph_t(raft::handle_t const& handle) : detail::graph_base_t() {} + + graph_t(raft::handle_t const& handle, + std::vector> const& edgelists, + partition_t const& partition, vertex_t number_of_vertices, edge_t number_of_edges, graph_properties_t properties, - bool sorted_by_global_degree_within_vertex_partition, + std::optional> const& segment_offsets, bool do_expensive_check = false); - graph_view_t view() + bool is_weighted() const { return adj_matrix_partition_weights_.has_value(); } + + graph_view_t view() const { - std::vector offsets(adj_matrix_partition_offsets_.size(), nullptr); - std::vector indices(adj_matrix_partition_indices_.size(), nullptr); - std::vector weights(adj_matrix_partition_weights_.size(), nullptr); + std::vector offsets(adj_matrix_partition_offsets_.size(), nullptr); + std::vector indices(adj_matrix_partition_indices_.size(), nullptr); + auto weights = adj_matrix_partition_weights_ + ? std::make_optional>( + (*adj_matrix_partition_weights_).size(), nullptr) + : std::nullopt; + auto dcs_nzd_vertices = adj_matrix_partition_dcs_nzd_vertices_ + ? 
std::make_optional>( + (*adj_matrix_partition_dcs_nzd_vertices_).size(), nullptr) + : std::nullopt; + auto dcs_nzd_vertex_counts = + adj_matrix_partition_dcs_nzd_vertex_counts_ + ? std::make_optional>( + (*adj_matrix_partition_dcs_nzd_vertex_counts_).size(), vertex_t{0}) + : std::nullopt; for (size_t i = 0; i < offsets.size(); ++i) { offsets[i] = adj_matrix_partition_offsets_[i].data(); indices[i] = adj_matrix_partition_indices_[i].data(); - if (weights.size() > 0) { weights[i] = adj_matrix_partition_weights_[i].data(); } + if (weights) { (*weights)[i] = (*adj_matrix_partition_weights_)[i].data(); } + if (dcs_nzd_vertices) { + (*dcs_nzd_vertices)[i] = (*adj_matrix_partition_dcs_nzd_vertices_)[i].data(); + (*dcs_nzd_vertex_counts)[i] = (*adj_matrix_partition_dcs_nzd_vertex_counts_)[i]; + } } return graph_view_t( @@ -86,26 +107,31 @@ class graph_tget_number_of_vertices(), this->get_number_of_edges(), this->get_graph_properties(), - vertex_partition_segment_offsets_.size() > 0, + adj_matrix_partition_segment_offsets_, false); } private: std::vector> adj_matrix_partition_offsets_{}; std::vector> adj_matrix_partition_indices_{}; - std::vector> adj_matrix_partition_weights_{}; + std::optional>> adj_matrix_partition_weights_{ + std::nullopt}; + // nzd: nonzero (local) degree, relevant only if segment_offsets.size() > 0 + std::optional>> adj_matrix_partition_dcs_nzd_vertices_{ + std::nullopt}; + std::optional> adj_matrix_partition_dcs_nzd_vertex_counts_{std::nullopt}; partition_t partition_{}; - std::vector - vertex_partition_segment_offsets_{}; // segment offsets within the vertex partition based on - // vertex degree, relevant only if - // sorted_by_global_degree_within_vertex_partition is true + // segment offsets within the vertex partition based on vertex degree, relevant only if + // segment_offsets.size() > 0 + std::optional> adj_matrix_partition_segment_offsets_{std::nullopt}; }; // single-GPU version @@ -123,36 +149,62 @@ class graph_t const &edge_list, + graph_t(raft::handle_t const& handle) + : detail::graph_base_t(), + offsets_(0, handle.get_stream()), + indices_(0, handle.get_stream()){}; + + graph_t(raft::handle_t const& handle, + edgelist_t const& edgelist, vertex_t number_of_vertices, graph_properties_t properties, - bool sorted_by_degree, + std::optional> const& segment_offsets, bool do_expensive_check = false); - vertex_t get_number_of_local_vertices() const { return this->get_number_of_vertices(); } + bool is_weighted() const { return weights_.has_value(); } - graph_view_t view() + graph_view_t view() const { return graph_view_t( *(this->get_handle_ptr()), offsets_.data(), indices_.data(), - weights_.data(), - segment_offsets_, + weights_ ? std::optional{(*weights_).data()} : std::nullopt, this->get_number_of_vertices(), this->get_number_of_edges(), this->get_graph_properties(), - segment_offsets_.size() > 0, + segment_offsets_, false); } private: + friend class cugraph::serializer::serializer_t; + + // cnstr. 
to be used _only_ for un/serialization purposes: + // + graph_t(raft::handle_t const& handle, + vertex_t number_of_vertices, + edge_t number_of_edges, + graph_properties_t properties, + rmm::device_uvector&& offsets, + rmm::device_uvector&& indices, + std::optional>&& weights, + std::optional>&& segment_offsets) + : detail::graph_base_t( + handle, number_of_vertices, number_of_edges, properties), + offsets_(std::move(offsets)), + indices_(std::move(indices)), + weights_(std::move(weights)), + segment_offsets_(std::move(segment_offsets)) + { + } + rmm::device_uvector offsets_; rmm::device_uvector indices_; - rmm::device_uvector weights_; - std::vector segment_offsets_{}; // segment offsets based on vertex degree, relevant - // only if sorted_by_global_degree is true + std::optional> weights_{std::nullopt}; + + // segment offsets based on vertex degree, relevant only if sorted_by_global_degree is true + std::optional> segment_offsets_{}; }; template @@ -180,6 +232,24 @@ template struct invalid_edge_id : invalid_idx { }; +template +struct invalid_component_id : invalid_idx { +}; + +template +__host__ __device__ std::enable_if_t::value, bool> is_valid_vertex( + vertex_t num_vertices, vertex_t v) +{ + return (v >= 0) && (v < num_vertices); +} + +template +__host__ __device__ std::enable_if_t::value, bool> is_valid_vertex( + vertex_t num_vertices, vertex_t v) +{ + return v < num_vertices; +} + } // namespace experimental } // namespace cugraph diff --git a/cpp/include/cugraph/experimental/graph_functions.hpp b/cpp/include/cugraph/experimental/graph_functions.hpp new file mode 100644 index 00000000000..10d3f6d2216 --- /dev/null +++ b/cpp/include/cugraph/experimental/graph_functions.hpp @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include + +#include +#include +#include +#include + +namespace cugraph { +namespace experimental { + +/** + * @brief renumber edgelist (multi-GPU) + * + * This function assumes that vertices and edges are pre-shuffled to their target processes using + * the compute_gpu_id_from_vertex_t & compute_gpu_id_from_edge_t functors, respectively. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam edge_t Type of edge identifiers. Needs to be an integral type. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * or multi-GPU (true). + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param optional_local_vertex_span If valid, part of the entire set of vertices in the graph to be + * renumbered. The first tuple element is the pointer to the array and the second tuple element is + * the size of the array. This parameter can be used to include isolated vertices. 
Applying the
+ * compute_gpu_id_from_vertex_t to every vertex should return the local GPU ID for this function to
+ * work (vertices should be pre-shuffled).
+ * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as
+ * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
+ * IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix partition
+ * should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored
+ * as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs
+ * are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix partition
+ * should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition
+ * assigned to this process).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return std::tuple, partition_t, vertex_t, edge_t,
+ * std::vector> Tuple of labels (vertex IDs before renumbering) for the entire set of
+ * vertices (assigned to this process in multi-GPU), partition_t object storing graph partitioning
+ * information, total number of vertices, total number of edges, and vertex partition segment
+ * offsets (a vertex partition is partitioned to multiple segments based on vertex degrees).
+ */
+template
+std::enable_if_t,
+                             partition_t,
+                             vertex_t,
+                             edge_t,
+                             std::vector>>
+renumber_edgelist(raft::handle_t const& handle,
+                  std::optional> optional_local_vertex_span,
+                  std::vector const& edgelist_major_vertices /* [INOUT] */,
+                  std::vector const& edgelist_minor_vertices /* [INOUT] */,
+                  std::vector const& edgelist_edge_counts,
+                  bool do_expensive_check = false);
+
+/**
+ * @brief renumber edgelist (single-GPU)
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param optional_vertex_span If valid, vertices in the graph to be renumbered. The first
+ * tuple element is the pointer to the array and the second tuple element is the size of the array.
+ * This parameter can be used to include isolated vertices.
+ * @param vertices The entire set of vertices in the graph to be renumbered.
+ * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as + * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex + * IDs are updated in-place ([INOUT] parameter). + * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is + * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). + * Vertex IDs are updated in-place ([INOUT] parameter). + * @param num_edgelist_edges Number of edges in the edgelist. + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + * @return std::tuple, std::vector> Tuple of labels (vertex + * IDs before renumbering) for the entire set of vertices and vertex partition segment offsets (a + * vertex partition is partitioned to multiple segments based on vertex degrees). + */ +template +std::enable_if_t, std::vector>> +renumber_edgelist(raft::handle_t const& handle, + std::optional> optional_vertex_span, + vertex_t* edgelist_major_vertices /* [INOUT] */, + vertex_t* edgelist_minor_vertices /* [INOUT] */, + edge_t num_edgelist_edges, + bool do_expensive_check = false); + +/** + * @brief Renumber external vertices to internal vertices based on the provided @p + * renumber_map_labels. + * + * Note cugraph::experimental::invalid_id::value remains unchanged. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * or multi-GPU (true). + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param vertices Pointer to the vertices to be renumbered. The input external vertices are + * renumbered to internal vertices in-place. + * @param num_vertices Number of vertices to be renumbered. + * @param renumber_map_labels Pointer to the external vertices corresponding to the internal + * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last). + * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this + * process in multi-GPU). + * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process + * in multi-GPU). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void renumber_ext_vertices(raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + bool do_expensive_check = false); + +/** + * @brief Unrenumber local internal vertices to external vertices based on the provided @p + * renumber_map_labels. + * + * Note cugraph::experimental::invalid_id::value remains unchanged. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param vertices Pointer to the local internal vertices to be unrenumbered. Each input element + * should be in [@p local_int_vertex_first, @p local_int_vertex_last). The input internal vertices + * are renumbered to external vertices in-place.
+ * @param num_vertices Number of vertices to be unrenumbered. + * @param renumber_map_labels Pointer to the external vertices corresponding to the internal + * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last). + * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this + * process in multi-GPU). + * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process + * in multi-GPU). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void unrenumber_local_int_vertices( + raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels /* size = local_int_vertex_last - local_int_vertex_first */, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + bool do_expensive_check = false); + +// FIXME: We may add unrenumber_int_rows (or cols) as this will require communication only within a +// sub-communicator and potentially be more efficient. +/** + * @brief Unrenumber (possibly non-local) internal vertices to external vertices based on the + * provided @p renumber_map_labels. + * + * Note cugraph::experimental::invalid_id::value remains unchanged. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * or multi-GPU (true). + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param vertices Pointer to the internal vertices to be unrenumbered. The input internal vertices + * are renumbered to external vertices in-place. + * @param num_vertices Number of vertices to be unrenumbered. + * @param renumber_map_labels Pointer to the external vertices corresponding to the internal + * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last). + * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this + * process in multi-GPU). + * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process + * in multi-GPU). + * @param vertex_partition_lasts Last local internal vertices (exclusive, assigned to each process + * in multi-GPU). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void unrenumber_int_vertices(raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + std::vector const& vertex_partition_lasts, + bool do_expensive_check = false); + +/** + * @brief Compute the coarsened graph. + * + * Aggregates the vertices with the same label to a new vertex in the output coarsened graph. + * Multi-edges in the coarsened graph are collapsed to a single edge with its weight equal to the + * sum of multi-edge weights. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam edge_t Type of edge identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weights. Needs to be a floating point type. + * @tparam store_transposed Flag indicating whether to store the graph adjacency matrix as is or as + * transposed.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * or multi-GPU (true). + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Graph view object of the input graph to be coarsened. + * @param labels Vertex labels (assigned to this process in multi-GPU) to be used in coarsening. + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + * @return std::tuple>, rmm::device_uvector> Tuple of the coarsened graph and labels mapped to the + * vertices (assigned to this process in multi-GPU) in the coarsened graph. + */ +template +std::tuple>, + rmm::device_uvector> +coarsen_graph( + raft::handle_t const& handle, + graph_view_t const& graph_view, + vertex_t const* labels, + bool do_expensive_check = false); + +/** + * @brief Relabel old labels to new labels. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * or multi-GPU (true). + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param old_new_label_pairs Pairs of an old label and the corresponding new label (each process + * holds only part of the entire old labels and the corresponding new labels; partitioning can be + * arbitrary). + * @param num_label_pairs Number of (old, new) label pairs. + * @param labels Labels to be relabeled. This initially holds old labels. Old labels are updated to + * new labels in-place ([INOUT] parameter). + * @param num_labels Number of labels to be relabeled. + * @param skip_missing_labels Flag dictating the behavior on missing labels (@p labels contains old + * labels missing in @p old_new_label_pairs). If set to true, missing elements are skipped (not + * relabeled). If set to false, the behavior is undefined (if @p do_expensive_check is set to true, + * this function will throw an exception instead). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void relabel(raft::handle_t const& handle, + std::tuple old_new_label_pairs, + vertex_t num_label_pairs, + vertex_t* labels /* [INOUT] */, + vertex_t num_labels, + bool skip_missing_labels, + bool do_expensive_check = false); + +/** + * @brief extract induced subgraph(s). + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam edge_t Type of edge identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weights. + * @tparam store_transposed Flag indicating whether to store the graph adjacency matrix as is or as + * transposed. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * or multi-GPU (true). + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Graph view object; induced subgraphs are extracted from @p graph_view. + * @param subgraph_offsets Pointer to subgraph vertex offsets (size == @p num_subgraphs + 1). + * @param subgraph_vertices Pointer to subgraph vertices (size == @p subgraph_offsets[@p + * num_subgraphs]).
The elements of @p subgraph_vertices for each subgraph should be sorted in + * ascending order and unique. + * @param num_subgraphs Number of induced subgraphs to extract. + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + * @return std::tuple, rmm::device_uvector, + * rmm::device_uvector, rmm::device_uvector> Quadruplet of edge major (destination + * if @p store_transposed is true, source otherwise) vertices, edge minor (source if @p + * store_transposed is true, destination otherwise) vertices, edge weights, and edge offsets for + * each induced subgraph (size == num_subgraphs + 1). The sizes of the edge major & minor vertices + * are edge_offsets[num_subgraphs]. The size of the edge weights is either + * edge_offsets[num_subgraphs] (if @p graph_view is weighted) or 0 (if @p graph_view is unweighted). + */ +template +std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_induced_subgraphs( + raft::handle_t const& handle, + graph_view_t const& graph_view, + size_t const* subgraph_offsets /* size == num_subgraphs + 1 */, + vertex_t const* subgraph_vertices /* size == subgraph_offsets[num_subgraphs] */, + size_t num_subgraphs, + bool do_expensive_check = false); + +/** + * @brief create a graph from (the optional vertex list and) the given edge list. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam edge_t Type of edge identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weights. Needs to be a floating point type. + * @tparam store_transposed Flag indicating whether to store the graph adjacency matrix as is or as + * transposed. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * or multi-GPU (true). + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param optional_vertex_span If valid, part of the entire set of vertices in the graph to be + * renumbered. The first tuple element is the pointer to the array and the second tuple element is + * the size of the array. This parameter can be used to include isolated vertices. If multi-GPU, + * applying the compute_gpu_id_from_vertex_t functor to every vertex should return the local GPU ID + * for this function to work (vertices should be pre-shuffled). + * @param edgelist_rows Vector of edge row (source) vertex IDs. + * @param edgelist_cols Vector of edge column (destination) vertex IDs. + * @param edgelist_weights Optional vector of edge weights. + * @param graph_properties Properties of the graph represented by the input (optional vertex list + * and) edge list. + * @param renumber Flag indicating whether to renumber vertices or not. + * @return std::tuple, rmm::device_uvector> Pair of the generated graph and the renumber map (if + * @p renumber is true) or std::nullopt (if @p renumber is false).
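+ *
+ * Illustrative single-GPU call (a sketch, not part of the original header; `handle`, `srcs`,
+ * `dsts`, and `weights` are assumed to be a configured raft::handle_t and pre-populated device
+ * vectors, and the template arguments are spelled out explicitly here for clarity):
+ * @code
+ * auto [graph, renumber_map] =
+ *   create_graph_from_edgelist<int32_t, int32_t, float, false, false>(
+ *     handle,
+ *     std::nullopt,        // no separate vertex list; vertices are inferred from the edges
+ *     std::move(srcs),     // rmm::device_uvector<int32_t> of source vertex IDs
+ *     std::move(dsts),     // rmm::device_uvector<int32_t> of destination vertex IDs
+ *     std::move(weights),  // std::optional<rmm::device_uvector<float>> of edge weights
+ *     graph_properties_t{false, false},  // not symmetric, not a multigraph
+ *     true);               // renumber vertex IDs
+ * @endcode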
+ */ +template +std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/experimental/graph_view.hpp b/cpp/include/cugraph/experimental/graph_view.hpp similarity index 58% rename from cpp/include/experimental/graph_view.hpp rename to cpp/include/cugraph/experimental/graph_view.hpp index ba327047b1d..272f02259e3 100644 --- a/cpp/include/experimental/graph_view.hpp +++ b/cpp/include/cugraph/experimental/graph_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,13 @@ */ #pragma once -#include +#include +#include +#include + +// visitor logic: +// +#include #include #include @@ -29,6 +35,9 @@ #include namespace cugraph { +namespace serializer { +class serializer_t; // forward... +} namespace experimental { /** @@ -40,32 +49,11 @@ namespace experimental { * * We need to partition 1D vertex arrays (storing per vertex values) and the 2D graph adjacency * matrix (or transposed 2D graph adjacency matrix) of G. A 1D vertex array of size V is divided to
One GPU will - * be responsible col_comm_size rectangular partitions in this case. + * The 2D graph adjacency matrix is first horizontally partitioned to P slabs, then each slab will + * be further vertically partitioned to P_row (instead of P_col in the default case) rectangles. One + * GPU will be responsible for col_comm_size rectangular partitions. * * To be more specific, a GPU with (col_comm_rank, row_comm_rank) will be responsible for * col_comm_size rectangular partitions [a_i,b_i) by [c,d) where a_i = @@ -82,14 +70,14 @@ template class partition_t { public: + partition_t() = default; + partition_t(std::vector const& vertex_partition_offsets, - bool hypergraph_partitioned, int row_comm_size, int col_comm_size, int row_comm_rank, int col_comm_rank) : vertex_partition_offsets_(vertex_partition_offsets), - hypergraph_partitioned_(hypergraph_partitioned), comm_rank_(col_comm_rank * row_comm_size + row_comm_rank), row_comm_size_(row_comm_size), col_comm_size_(col_comm_size), @@ -157,10 +145,7 @@ class partition_t { get_vertex_partition_first(vertex_partition_idx); } - size_t get_number_of_matrix_partitions() const - { - return hypergraph_partitioned_ ? col_comm_size_ : 1; - } + size_t get_number_of_matrix_partitions() const { return col_comm_size_; } // major: row of the graph adjacency matrix (if the graph adjacency matrix is stored as is) or // column of the graph adjacency matrix (if the transposed graph adjacency matrix is stored). @@ -173,16 +158,18 @@ class partition_t { vertex_t get_matrix_partition_major_first(size_t partition_idx) const { - return hypergraph_partitioned_ - ? vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_] - : vertex_partition_offsets_[col_comm_rank_ * row_comm_size_]; + return vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_]; } vertex_t get_matrix_partition_major_last(size_t partition_idx) const { - return hypergraph_partitioned_ - ? vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_ + 1] - : vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_]; + return vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_ + 1]; + } + + vertex_t get_matrix_partition_major_size(size_t partition_idx) const + { + return get_matrix_partition_major_last(partition_idx) - + get_matrix_partition_major_first(partition_idx); } vertex_t get_matrix_partition_major_value_start_offset(size_t partition_idx) const @@ -202,24 +189,21 @@ class partition_t { vertex_t get_matrix_partition_minor_first() const { - return hypergraph_partitioned_ ? vertex_partition_offsets_[col_comm_rank_ * row_comm_size_] - : vertex_partition_offsets_[row_comm_rank_ * col_comm_size_]; + return vertex_partition_offsets_[col_comm_rank_ * row_comm_size_]; } vertex_t get_matrix_partition_minor_last() const { - return hypergraph_partitioned_ - ? 
vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_] - : vertex_partition_offsets_[(row_comm_rank_ + 1) * col_comm_size_]; + return vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_]; } - // FIXME: this function may be removed if we use the same partitioning strategy whether hypergraph - // partitioning is applied or not - bool is_hypergraph_partitioned() const { return hypergraph_partitioned_; } + vertex_t get_matrix_partition_minor_size() const + { + return get_matrix_partition_minor_last() - get_matrix_partition_minor_first(); + } private: std::vector vertex_partition_offsets_{}; // size = P + 1 - bool hypergraph_partitioned_{false}; int comm_rank_{0}; int row_comm_size_{0}; @@ -238,15 +222,22 @@ struct graph_properties_t { namespace detail { +using namespace cugraph::visitors; + // FIXME: threshold values require tuning +// use the hypersparse format (currently, DCSR or DCSC) for the vertices with their degrees smaller +// than col_comm_size * hypersparse_threshold_ratio, should be less than 1.0 +double constexpr hypersparse_threshold_ratio = 0.5; size_t constexpr low_degree_threshold{raft::warp_size()}; size_t constexpr mid_degree_threshold{1024}; -size_t constexpr num_segments_per_vertex_partition{3}; +size_t constexpr num_sparse_segments_per_vertex_partition{3}; // Common for both graph_view_t & graph_t and both single-GPU & multi-GPU versions template -class graph_base_t { +class graph_base_t : public graph_envelope_t::base_graph_t /*<- visitor logic*/ { public: + graph_base_t() = default; // Note: required by visitor logic + graph_base_t(raft::handle_t const& handle, vertex_t number_of_vertices, edge_t number_of_edges, @@ -274,7 +265,14 @@ class graph_base_t { bool is_symmetric() const { return properties_.is_symmetric; } bool is_multigraph() const { return properties_.is_multigraph; } + void apply(visitor_t& v) const override // <- visitor logic + { + v.visit_graph(*this); + } + protected: + friend class cugraph::serializer::serializer_t; + raft::handle_t const* get_handle_ptr() const { return handle_ptr_; }; graph_properties_t get_graph_properties() const { return properties_; } @@ -318,21 +316,21 @@ class graph_view_t const& adj_matrix_partition_offsets, - std::vector const& adj_matrix_partition_indices, - std::vector const& adj_matrix_partition_weights, - std::vector const& vertex_partition_segment_offsets, - partition_t const& partition, - vertex_t number_of_vertices, - edge_t number_of_edges, - graph_properties_t properties, - bool sorted_by_global_degree_within_vertex_partition, - bool do_expensive_check = false); - - bool is_weighted() const { return adj_matrix_partition_weights_.size() > 0; } - - partition_t get_partition() const { return partition_; } + graph_view_t( + raft::handle_t const& handle, + std::vector const& adj_matrix_partition_offsets, + std::vector const& adj_matrix_partition_indices, + std::optional> const& adj_matrix_partition_weights, + std::optional> const& adj_matrix_partition_dcs_nzd_vertices, + std::optional> const& adj_matrix_partition_dcs_nzd_vertex_counts, + partition_t const& partition, + vertex_t number_of_vertices, + edge_t number_of_edges, + graph_properties_t properties, + std::optional> const& adj_matrix_partition_segment_offsets, + bool do_expensive_check = false); + + bool is_weighted() const { return adj_matrix_partition_weights_.has_value(); } vertex_t get_number_of_local_vertices() const { @@ -399,6 +397,29 @@ class graph_view_t + std::enable_if_t get_local_adj_matrix_partition_row_first() const + { + return 
partition_.get_matrix_partition_minor_first(); + } + + template + std::enable_if_t get_local_adj_matrix_partition_row_last() const + { + return partition_.get_matrix_partition_minor_last(); + } + + template + std::enable_if_t get_number_of_local_adj_matrix_partition_rows() const + { + return get_local_adj_matrix_partition_row_last() - get_local_adj_matrix_partition_row_first(); + } + vertex_t get_local_adj_matrix_partition_row_first(size_t adj_matrix_partition_idx) const { return store_transposed ? partition_.get_matrix_partition_minor_first() @@ -411,6 +432,12 @@ class graph_view_t + std::enable_if_t get_local_adj_matrix_partition_col_first() const + { + return partition_.get_matrix_partition_minor_first(); + } + + template + std::enable_if_t get_local_adj_matrix_partition_col_last() const + { + return partition_.get_matrix_partition_minor_last(); + } + + template + std::enable_if_t get_number_of_local_adj_matrix_partition_cols() const + { + return get_local_adj_matrix_partition_col_last() - get_local_adj_matrix_partition_col_first(); + } + vertex_t get_local_adj_matrix_partition_col_first(size_t adj_matrix_partition_idx) const { return store_transposed ? partition_.get_matrix_partition_major_first(adj_matrix_partition_idx) @@ -431,6 +476,12 @@ class graph_view_t> get_local_adj_matrix_partition_segment_offsets( + size_t partition_idx) const { - return adj_matrix_partition_offsets_[adj_matrix_partition_idx]; + if (adj_matrix_partition_segment_offsets_) { + auto size_per_partition = + (*adj_matrix_partition_segment_offsets_).size() / partition_.get_col_size(); + return std::vector( + (*adj_matrix_partition_segment_offsets_).begin() + partition_idx * size_per_partition, + (*adj_matrix_partition_segment_offsets_).begin() + + (partition_idx + 1) * size_per_partition); + } else { + return std::nullopt; + } } - // FIXME: this function is not part of the public stable API.This function is mainly for pattern - // accelerator implementation. This function is currently public to support the legacy - // implementations directly accessing CSR/CSC data, but this function will eventually become - // private or even disappear if we switch to CSR + DCSR (or CSC + DCSC). - vertex_t const* indices(size_t adj_matrix_partition_idx) const + vertex_partition_view_t get_vertex_partition_view() const { - return adj_matrix_partition_indices_[adj_matrix_partition_idx]; + return vertex_partition_view_t(this->get_number_of_vertices(), + this->get_local_vertex_first(), + this->get_local_vertex_last()); } - // FIXME: this function is not part of the public stable API.This function is mainly for pattern - // accelerator implementation. This function is currently public to support the legacy - // implementations directly accessing CSR/CSC data, but this function will eventually become - // private or even disappear if we switch to CSR + DCSR (or CSC + DCSC). - weight_t const* weights(size_t adj_matrix_partition_idx) const + matrix_partition_view_t get_matrix_partition_view( + size_t adj_matrix_partition_idx) const { - return adj_matrix_partition_weights_.size() > 0 - ? adj_matrix_partition_weights_[adj_matrix_partition_idx] - : static_cast(nullptr); - } + return matrix_partition_view_t( + adj_matrix_partition_offsets_[adj_matrix_partition_idx], + adj_matrix_partition_indices_[adj_matrix_partition_idx], + adj_matrix_partition_weights_ + ? std::optional{(*adj_matrix_partition_weights_)[adj_matrix_partition_idx]} + : std::nullopt, + adj_matrix_partition_dcs_nzd_vertices_ + ? 
std::optional{( + *adj_matrix_partition_dcs_nzd_vertices_)[adj_matrix_partition_idx]} + : std::nullopt, + adj_matrix_partition_dcs_nzd_vertex_counts_ + ? std::optional{( + *adj_matrix_partition_dcs_nzd_vertex_counts_)[adj_matrix_partition_idx]} + : std::nullopt, + this->get_number_of_local_adj_matrix_partition_edges(adj_matrix_partition_idx), + store_transposed ? this->get_local_adj_matrix_partition_col_first(adj_matrix_partition_idx) + : this->get_local_adj_matrix_partition_row_first(adj_matrix_partition_idx), + store_transposed ? this->get_local_adj_matrix_partition_col_last(adj_matrix_partition_idx) + : this->get_local_adj_matrix_partition_row_last(adj_matrix_partition_idx), + store_transposed ? this->get_local_adj_matrix_partition_row_first(adj_matrix_partition_idx) + : this->get_local_adj_matrix_partition_col_first(adj_matrix_partition_idx), + store_transposed ? this->get_local_adj_matrix_partition_row_last(adj_matrix_partition_idx) + : this->get_local_adj_matrix_partition_col_last(adj_matrix_partition_idx), + store_transposed + ? this->get_local_adj_matrix_partition_col_value_start_offset(adj_matrix_partition_idx) + : this->get_local_adj_matrix_partition_row_value_start_offset(adj_matrix_partition_idx)); + } + + rmm::device_uvector compute_in_degrees(raft::handle_t const& handle) const; + rmm::device_uvector compute_out_degrees(raft::handle_t const& handle) const; + + rmm::device_uvector compute_in_weight_sums(raft::handle_t const& handle) const; + rmm::device_uvector compute_out_weight_sums(raft::handle_t const& handle) const; + + edge_t compute_max_in_degree(raft::handle_t const& handle) const; + edge_t compute_max_out_degree(raft::handle_t const& handle) const; + + weight_t compute_max_in_weight_sum(raft::handle_t const& handle) const; + weight_t compute_max_out_weight_sum(raft::handle_t const& handle) const; private: std::vector adj_matrix_partition_offsets_{}; std::vector adj_matrix_partition_indices_{}; - std::vector adj_matrix_partition_weights_{}; + std::optional> adj_matrix_partition_weights_{}; + + // relevant only if we use the CSR + DCSR (or CSC + DCSC) hybrid format + std::optional> adj_matrix_partition_dcs_nzd_vertices_{}; + std::optional> adj_matrix_partition_dcs_nzd_vertex_counts_{}; + + std::vector adj_matrix_partition_number_of_edges_{}; partition_t partition_{}; - std::vector - vertex_partition_segment_offsets_{}; // segment offsets within the vertex partition based on - // vertex degree, relevant only if - // sorted_by_global_degree_within_vertex_partition is true + // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered + std::optional> adj_matrix_partition_segment_offsets_{}; }; // single-GPU version @@ -524,15 +595,14 @@ class graph_view_t const& segment_offsets, + std::optional weights, vertex_t number_of_vertices, edge_t number_of_edges, graph_properties_t properties, - bool sorted_by_degree, + std::optional> const& segment_offsets, bool do_expensive_check = false); - bool is_weighted() const { return weights_ != nullptr; } + bool is_weighted() const { return weights_.has_value(); } vertex_t get_number_of_local_vertices() const { return this->get_number_of_vertices(); } @@ -567,6 +637,30 @@ class graph_view_tget_number_of_vertices(); } + edge_t get_number_of_local_adj_matrix_partition_edges(size_t adj_matrix_partition_idx) const + { + assert(adj_matrix_partition_idx == 0); + return this->get_number_of_edges(); + } + + template + std::enable_if_t get_local_adj_matrix_partition_row_first() const + { + return 
get_local_adj_matrix_partition_row_first(0); + } + + template + std::enable_if_t get_local_adj_matrix_partition_row_last() const + { + return get_local_adj_matrix_partition_row_last(0); + } + + template + std::enable_if_t get_number_of_local_adj_matrix_partition_rows() const + { + return get_number_of_local_adj_matrix_partition_rows(0); + } + vertex_t get_local_adj_matrix_partition_row_first(size_t adj_matrix_partition_idx) const { assert(adj_matrix_partition_idx == 0); @@ -586,6 +680,24 @@ class graph_view_t + std::enable_if_t get_local_adj_matrix_partition_col_first() const + { + return get_local_adj_matrix_partition_col_first(0); + } + + template + std::enable_if_t get_local_adj_matrix_partition_col_last() const + { + return get_local_adj_matrix_partition_col_last(0); + } + + template + std::enable_if_t get_number_of_local_adj_matrix_partition_cols() const + { + return get_number_of_local_adj_matrix_partition_cols(0); + } + vertex_t get_local_adj_matrix_partition_col_first(size_t adj_matrix_partition_idx) const { assert(adj_matrix_partition_idx == 0); @@ -605,32 +717,45 @@ class graph_view_t> get_local_adj_matrix_partition_segment_offsets( + size_t adj_matrix_partition_idx) const + { + assert(adj_matrix_partition_idx == 0); + return segment_offsets_; + } + + vertex_partition_view_t get_vertex_partition_view() const + { + return vertex_partition_view_t(this->get_number_of_vertices()); + } + + matrix_partition_view_t get_matrix_partition_view( + size_t adj_matrix_partition_idx = 0) const + { + assert(adj_matrix_partition_idx == 0); // there is only one matrix partition in single-GPU + return matrix_partition_view_t( + offsets_, indices_, weights_, this->get_number_of_vertices(), this->get_number_of_edges()); + } + + rmm::device_uvector compute_in_degrees(raft::handle_t const& handle) const; + rmm::device_uvector compute_out_degrees(raft::handle_t const& handle) const; - // FIXME: this function is not part of the public stable API.This function is mainly for pattern - // accelerator implementation. This function is currently public to support the legacy - // implementations directly accessing CSR/CSC data, but this function will eventually become - // private. - edge_t const* offsets() const { return offsets_; } + rmm::device_uvector compute_in_weight_sums(raft::handle_t const& handle) const; + rmm::device_uvector compute_out_weight_sums(raft::handle_t const& handle) const; - // FIXME: this function is not part of the public stable API.This function is mainly for pattern - // accelerator implementation. This function is currently public to support the legacy - // implementations directly accessing CSR/CSC data, but this function will eventually become - // private. - vertex_t const* indices() const { return indices_; } + edge_t compute_max_in_degree(raft::handle_t const& handle) const; + edge_t compute_max_out_degree(raft::handle_t const& handle) const; - // FIXME: this function is not part of the public stable API.This function is mainly for pattern - // accelerator implementation. This function is currently public to support the legacy - // implementations directly accessing CSR/CSC data, but this function will eventually become - // private. 
- weight_t const* weights() const { return weights_; } + weight_t compute_max_in_weight_sum(raft::handle_t const& handle) const; + weight_t compute_max_out_weight_sum(raft::handle_t const& handle) const; private: edge_t const* offsets_{nullptr}; vertex_t const* indices_{nullptr}; - weight_t const* weights_{nullptr}; - std::vector segment_offsets_{}; // segment offsets based on vertex degree, relevant - // only if sorted_by_global_degree is true + std::optional weights_{std::nullopt}; + + // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered + std::optional> segment_offsets_{std::nullopt}; }; } // namespace experimental diff --git a/cpp/include/functions.hpp b/cpp/include/cugraph/functions.hpp similarity index 51% rename from cpp/include/functions.hpp rename to cpp/include/cugraph/functions.hpp index ede1be3767f..23edd204c3b 100644 --- a/cpp/include/functions.hpp +++ b/cpp/include/cugraph/functions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ #include #include -#include +#include namespace cugraph { @@ -40,43 +40,9 @@ namespace cugraph { * */ template -std::unique_ptr> coo_to_csr( - GraphCOOView const &graph, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Renumber source and destination indices - * - * Renumber source and destination indexes to be a dense numbering, - * using contiguous values between 0 and number of vertices minus 1. - * - * @throws cugraph::logic_error when an error occurs. - * - * @tparam VT_IN type of vertex index input - * @tparam VT_OUT type of vertex index output - * @tparam ET type of edge index - * - * @param[in] number_of_edges number of edges in the graph - * @param[in] src Pointer to device memory containing source vertex ids - * @param[in] dst Pointer to device memory containing destination vertex ids - * @param[out] src_renumbered Pointer to device memory containing the output source vertices. - * @param[out] dst_renumbered Pointer to device memory containing the output destination vertices. 
- * @param[out] map_size Pointer to local memory containing the number of elements in the - * renumbering map - * @param[in] mr Memory resource used to allocate the returned graph - * - * @return Unique pointer to renumbering map - * - */ -template -std::unique_ptr renumber_vertices( - ET number_of_edges, - VT_IN const *src, - VT_IN const *dst, - VT_OUT *src_renumbered, - VT_OUT *dst_renumbered, - ET *map_size, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr> coo_to_csr( + legacy::GraphCOOView const& graph, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Broadcast using handle communicator @@ -94,7 +60,7 @@ std::unique_ptr renumber_vertices( // FIXME: It would be better to expose it in RAFT template -void comms_bcast(const raft::handle_t &handle, value_t *value, size_t count) +void comms_bcast(const raft::handle_t& handle, value_t* value, size_t count) { handle.get_comms().bcast(value, count, 0, handle.get_stream()); } diff --git a/cpp/include/cugraph/graph_generators.hpp b/cpp/include/cugraph/graph_generators.hpp new file mode 100644 index 00000000000..94ae5d2cf81 --- /dev/null +++ b/cpp/include/cugraph/graph_generators.hpp @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include +#include + +namespace cugraph { + +/** + * @brief generate an edge list for an R-mat graph. + * + * This function allows multi-edges and self-loops similar to the Graph 500 reference + * implementation. + * + * NOTE: The scramble_vertex_ids function needs to be called in order to generate a + * graph conforming to the Graph 500 specification (note that scrambling does not + * affect cuGraph's graph construction performance, so this is generally unnecessary). + * If `edge_factor` is given (e.g. Graph 500), set @p num_edges to + * (size_t{1} << @p scale) * `edge_factor`. To generate an undirected graph, set @p b == @p c and @p + * clip_and_flip = true. All the resulting edges will be placed in the lower triangular part + * (including the diagonal) of the graph adjacency matrix. + * + * For multi-GPU generation with `P` GPUs, @p seed should be set to different values in different + * GPUs to avoid every GPU generating the same set of edges. @p num_edges should be adjusted as + * well; e.g. assuming `edge_factor` is given, set @p num_edges = (size_t{1} << @p scale) * + * `edge_factor` / `P` + (rank < (((size_t{1} << @p scale) * `edge_factor`) % P) ? 1 : 0). + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param scale Scale factor to set the number of vertices in the graph. Vertex IDs have values in + * [0, V), where V = 1 << @p scale. + * @param num_edges Number of edges to generate.
+ * @param a a, b, c, d (= 1.0 - (a + b + c)) in the R-mat graph generator (visit https://graph500.org + * for additional details). a, b, c, d should be non-negative and a + b + c should be no larger + * than 1.0. + * @param b a, b, c, d (= 1.0 - (a + b + c)) in the R-mat graph generator (visit https://graph500.org + * for additional details). a, b, c, d should be non-negative and a + b + c should be no larger + * than 1.0. + * @param c a, b, c, d (= 1.0 - (a + b + c)) in the R-mat graph generator (visit https://graph500.org + * for additional details). a, b, c, d should be non-negative and a + b + c should be no larger + * than 1.0. + * @param seed Seed value for the random number generator. + * @param clip_and_flip Flag controlling whether to generate edges only in the lower triangular part + * (including the diagonal) of the graph adjacency matrix (if set to `true`) or not (if set to + * `false`). + * @return std::tuple, rmm::device_uvector> A tuple of + * rmm::device_uvector objects for edge source vertex IDs and edge destination vertex IDs. + */ +template +std::tuple, rmm::device_uvector> generate_rmat_edgelist( + raft::handle_t const& handle, + size_t scale, + size_t num_edges, + double a = 0.57, + double b = 0.19, + double c = 0.19, + uint64_t seed = 0, + bool clip_and_flip = false); + +enum class generator_distribution_t { POWER_LAW = 0, UNIFORM }; + +/** + * @brief generate multiple edge lists using the R-mat graph generator. + * + * This function allows multi-edges and self-loops similar to the Graph 500 reference + * implementation. + * + * NOTE: The scramble_vertex_ids function needs to be called in order to generate a + * graph conforming to the Graph 500 specification (note that scrambling does not + * affect cuGraph's graph construction performance, so this is generally unnecessary). + * If `edge_factor` is given (e.g. Graph 500), set @p num_edges to + * (size_t{1} << @p scale) * `edge_factor`. To generate an undirected graph, set @p b == @p c and @p + * clip_and_flip = true. All the resulting edges will be placed in the lower triangular part + * (including the diagonal) of the graph adjacency matrix. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param n_edgelists Number of edge lists (graphs) to generate + * @param min_scale Scale factor to set the minimum number of vertices in the graph. + * @param max_scale Scale factor to set the maximum number of vertices in the graph. + * @param edge_factor Average number of edges per vertex to generate. + * @param size_distribution Distribution of the graph sizes; impacts the scale parameter of the + * R-MAT generator + * @param edge_distribution Edge distribution for each graph; impacts how the R-MAT parameters + * a, b, c, d are set. + * @param seed Seed value for the random number generator. + * @param clip_and_flip Flag controlling whether to generate edges only in the lower triangular part + * (including the diagonal) of the graph adjacency matrix (if set to `true`) or not (if set to + * `false`). + * @return A vector of std::tuple, rmm::device_uvector> of + * size @p n_edgelists, each vector element being a tuple of rmm::device_uvector objects for edge + * source vertex IDs and edge destination vertex IDs.
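+ *
+ * Illustrative call (a sketch, not part of the original header; `handle` is assumed to be a
+ * configured raft::handle_t and the vertex type is chosen arbitrarily):
+ * @code
+ * size_t const n_edgelists = 8;   // number of graphs to generate
+ * size_t const min_scale   = 10;  // smallest graph has 2^10 vertices
+ * size_t const max_scale   = 16;  // largest graph has 2^16 vertices
+ * auto edgelists = generate_rmat_edgelists<int32_t>(
+ *   handle, n_edgelists, min_scale, max_scale);  // remaining parameters keep their defaults
+ * for (auto& [srcs, dsts] : edgelists) {
+ *   // srcs/dsts are rmm::device_uvector<int32_t> holding one graph's edge endpoints
+ * }
+ * @endcode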
+ */ +template +std::vector, rmm::device_uvector>> +generate_rmat_edgelists( + raft::handle_t const& handle, + size_t n_edgelists, + size_t min_scale, + size_t max_scale, + size_t edge_factor = 16, + generator_distribution_t size_distribution = generator_distribution_t::POWER_LAW, + generator_distribution_t edge_distribution = generator_distribution_t::POWER_LAW, + uint64_t seed = 0, + bool clip_and_flip = false); + +/** + * @brief generate an edge list for a path graph + * + * A path graph of size n connects the vertices from 0 to (n - 1) + * in a single long path: ((0,1), (1,2), ..., (n - 2, n - 1)) + * + * If executed in a multi-gpu context (handle comms has been initialized) + * the path will span all GPUs including an edge from the last vertex on + * GPU i to the first vertex on GPU (i+1) + * + * This function will generate a collection of path graphs. @p component_parameters_v + * defines the parameters for generating each component. Each element of + * @p component_parameters_v defines a tuple consisting of the number of vertices + * and the base vertex id for the component. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param component_parameters_v A vector containing tuples consisting of the number of vertices and + * base vertex id for each component to generate. + * @return std::tuple, rmm::device_uvector> A tuple of + * rmm::device_uvector objects for edge source vertex IDs and edge destination vertex IDs. + */ +template +std::tuple, rmm::device_uvector> +generate_path_graph_edgelist( + raft::handle_t const& handle, + std::vector> const& component_parameters_v); + +/** + * @brief generate an edge list for a 2D Mesh Graph + * + * A sequence of 2D mesh graphs will be constructed according to the + * component specifications. Each 2D mesh graph is configured with a tuple + * containing (x, y, base_vertex_id). @p component_parameters_v will contain + * a tuple for each component. + * + * If executed in a multi-gpu context (handle comms has been initialized) + * each GPU will generate disjoint 2D mesh constructs of equal size. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param component_parameters_v Vector containing tuple defining the configuration of each + * component + * @return std::tuple, rmm::device_uvector> A tuple of + * rmm::device_uvector objects for edge source vertex IDs and edge destination vertex IDs. + */ +template +std::tuple, rmm::device_uvector> +generate_2d_mesh_graph_edgelist( + raft::handle_t const& handle, + std::vector> const& component_parameters_v); + +/** + * @brief generate an edge list for a 3D Mesh Graph + * + * A sequence of 3D mesh graphs will be constructed according to the + * component specifications. Each 3D mesh graph is configured with a tuple + * containing (x, y, z, base_vertex_id). @p component_parameters_v will contain + * a tuple for each component. + * + * If executed in a multi-gpu context (handle comms has been initialized) + * each GPU will generate disjoint 3D mesh constructs of equal size. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param component_parameters_v Vector containing tuple defining the configuration of each + * component + * @return std::tuple, rmm::device_uvector> A tuple of + * rmm::device_uvector objects for edge source vertex IDs and edge destination vertex IDs. + */ +template +std::tuple, rmm::device_uvector> +generate_3d_mesh_graph_edgelist( + raft::handle_t const& handle, + std::vector> const& component_parameters_v); + +/** + * @brief generate edge lists for some complete graphs + * + * A sequence of complete graphs will be constructed according to the + * component specifications. Each complete graph is configured with a tuple + * containing (n, base_vertex_id). @p component_parameters_v will contain + * a tuple for each component. + * + * If executed in a multi-gpu context (handle comms has been initialized) + * each GPU will generate disjoint complete graph constructs of equal size. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param component_parameters_v Vector containing tuple defining the configuration of each + * component + * @return std::tuple, rmm::device_uvector> A tuple of + * rmm::device_uvector objects for edge source vertex IDs and edge destination vertex IDs. + */ +template +std::tuple, rmm::device_uvector> +generate_complete_graph_edgelist( + raft::handle_t const& handle, + std::vector> const& component_parameters_v); + +/** + * @brief generate an edge list for an Erdos-Renyi graph + * + * This API supports the G(n,p) model which requires O(n^2) work. + * + * If executed in a multi-gpu context (handle comms has been initialized) + * each GPU will generate Erdos-Renyi edges for its portion of the 2D + * partitioning of the adjacency matrix. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param num_vertices Number of vertices to use in the generated graph + * @param p Probability for edge creation + * @param base_vertex_id Starting vertex id for the generated graph + * @param seed Seed value for the random number generator. + * @return std::tuple, rmm::device_uvector> A tuple of + * rmm::device_uvector objects for edge source vertex IDs and edge destination vertex IDs. + */ +template +std::tuple, rmm::device_uvector> +generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle, + vertex_t num_vertices, + float p, + vertex_t base_vertex_id, + uint64_t seed = 0); + +/** + * @brief generate an edge list for an Erdos-Renyi graph + * + * This API supports the G(n,m) model. + * + * If executed in a multi-gpu context (handle comms has been initialized) + * each GPU will generate Erdos-Renyi edges for its portion of the 2D + * partitioning of the adjacency matrix. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms.
+ * @param num_vertices Number of vertices to use in the generated graph + * @param m Number of edges to generate + * @param base_vertex_id Starting vertex id for the generated graph + * @param seed Seed value for the random number generator. + * @return std::tuple, rmm::device_uvector> A tuple of + * rmm::device_uvector objects for edge source vertex IDs and edge destination vertex IDs. + */ +template +std::tuple, rmm::device_uvector> +generate_erdos_renyi_graph_edgelist_gnm(raft::handle_t const& handle, + vertex_t num_vertices, + size_t m, + vertex_t base_vertex_id, + uint64_t seed = 0); + +/** + * @brief symmetrize an edgelist + * + * Given an edgelist for a graph, symmetrize and deduplicate edges. + * + * If a duplicate edge exists in a weighted graph, one of the weights is arbitrarily + * returned. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam weight_t Type of weights. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param d_src_v Vector of source vertices + * @param d_dst_v Vector of destination vertices + * @param optional_d_weights_v Optional vector of edge weights + * @return std::tuple, rmm::device_uvector> A tuple of + * rmm::device_uvector objects for edge source vertex IDs, edge destination vertex IDs, and + * optional edge weights. + */ +template +std::tuple, + rmm::device_uvector, + std::optional>> +symmetrize_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& d_src_v, + rmm::device_uvector&& d_dst_v, + std::optional>&& optional_d_weights_v); + +/** + * @brief scramble vertex ids in a graph + * + * Given an edgelist for a graph, scramble all vertex ids by the given offset. + * This translation is done in place. + * + * The scramble code here follows the algorithm in the Graph 500 reference + * implementation version 3.0.0. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param d_src_v Vector of source vertices + * @param d_dst_v Vector of destination vertices + * @param vertex_id_offset Offset to add to each vertex id + * @param seed Used to initialize random number generator + */ +template +void scramble_vertex_ids(raft::handle_t const& handle, + rmm::device_uvector& d_src_v, + rmm::device_uvector& d_dst_v, + vertex_t vertex_id_offset, + uint64_t seed = 0); + +/** + * @brief Combine edgelists from multiple sources into a single edgelist + * + * If executed in a multi-gpu context (handle comms has been initialized) + * each GPU will operate only on its subset of data. Any shuffling to get + * edges onto the same GPU should be done prior to calling this function. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms.
+ * @param d_sources The source vertex ids to combine + * @param d_dests The destination vertex ids to combine + * @param optional_d_weights Optional vector of weights to combine + * @param remove_multi_edges If true (the default), remove multi-edges; if false, leave them in + * @return std::tuple, rmm::device_uvector, + * rmm::device_uvector> A tuple of rmm::device_uvector objects for edge source vertex IDs, + * edge destination vertex IDs, and edge weights. + */ +template +std::tuple, + rmm::device_uvector, + std::optional>> +combine_edgelists(raft::handle_t const& handle, + std::vector>&& d_sources, + std::vector>&& d_dests, + std::optional>>&& optional_d_weights, + bool remove_multi_edges = true); + +} // namespace cugraph diff --git a/cpp/include/internals.hpp b/cpp/include/cugraph/internals.hpp similarity index 83% rename from cpp/include/internals.hpp rename to cpp/include/cugraph/internals.hpp index f71426491e3..1c311304ae4 100644 --- a/cpp/include/internals.hpp +++ b/cpp/include/cugraph/internals.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,9 +35,9 @@ class GraphBasedDimRedCallback : public Callback { this->n_components = n_components; this->isFloat = std::is_same::value; } - virtual void on_preprocess_end(void *positions) = 0; - virtual void on_epoch_end(void *positions) = 0; - virtual void on_train_end(void *positions) = 0; + virtual void on_preprocess_end(void* positions) = 0; + virtual void on_epoch_end(void* positions) = 0; + virtual void on_train_end(void* positions) = 0; protected: int n; diff --git a/cpp/include/eidecl_graph.hpp b/cpp/include/cugraph/legacy/eidecl_graph.hpp similarity index 98% rename from cpp/include/eidecl_graph.hpp rename to cpp/include/cugraph/legacy/eidecl_graph.hpp index 03f6a675597..d636b7fba5b 100644 --- a/cpp/include/eidecl_graph.hpp +++ b/cpp/include/cugraph/legacy/eidecl_graph.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once namespace cugraph { +namespace legacy { extern template class GraphViewBase; extern template class GraphViewBase; extern template class GraphViewBase; @@ -88,4 +89,5 @@ extern template class GraphCSC; extern template class GraphCSC; extern template class GraphCSC; extern template class GraphCSC; +} // namespace legacy } // namespace cugraph diff --git a/cpp/include/eidir_graph.hpp b/cpp/include/cugraph/legacy/eidir_graph.hpp similarity index 97% rename from cpp/include/eidir_graph.hpp rename to cpp/include/cugraph/legacy/eidir_graph.hpp index d7273b9ea37..df9f6eb8f71 100644 --- a/cpp/include/eidir_graph.hpp +++ b/cpp/include/cugraph/legacy/eidir_graph.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@ #pragma once namespace cugraph { +namespace legacy { template class GraphViewBase; template class GraphViewBase; template class GraphViewBase; @@ -70,4 +71,5 @@ template class GraphCSC; template class GraphCSC; template class GraphCSC; template class GraphCSC; +} // namespace legacy } // namespace cugraph diff --git a/cpp/include/graph.hpp b/cpp/include/cugraph/legacy/graph.hpp similarity index 89% rename from cpp/include/graph.hpp rename to cpp/include/cugraph/legacy/graph.hpp index b30159566b5..4345fd225c1 100644 --- a/cpp/include/graph.hpp +++ b/cpp/include/cugraph/legacy/graph.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include namespace cugraph { +namespace legacy { enum class PropType { PROP_UNDEF, PROP_FALSE, PROP_TRUE }; @@ -57,17 +58,21 @@ class GraphViewBase { using edge_type = edge_t; using weight_type = weight_t; - raft::handle_t *handle; - weight_t *edge_data; ///< edge weight + raft::handle_t* handle; + weight_t* edge_data; ///< edge weight GraphProperties prop; vertex_t number_of_vertices; edge_t number_of_edges; - vertex_t *local_vertices; - edge_t *local_edges; - vertex_t *local_offsets; + vertex_t* local_vertices; + edge_t* local_edges; + vertex_t* local_offsets; + + vertex_t get_number_of_vertices() const { return number_of_vertices; } + + vertex_t get_local_vertex_first() const { return vertex_t{0}; } /** * @brief Fill the identifiers array with the vertex identifiers. @@ -75,18 +80,18 @@ class GraphViewBase { * @param[out] identifiers Pointer to device memory to store the vertex * identifiers */ - void get_vertex_identifiers(vertex_t *identifiers) const; + void get_vertex_identifiers(vertex_t* identifiers) const; - void set_local_data(vertex_t *vertices, edge_t *edges, vertex_t *offsets) + void set_local_data(vertex_t* vertices, edge_t* edges, vertex_t* offsets) { local_vertices = vertices; local_edges = edges; local_offsets = offsets; } - void set_handle(raft::handle_t *handle_in) { handle = handle_in; } + void set_handle(raft::handle_t* handle_in) { handle = handle_in; } - GraphViewBase(weight_t *edge_data, vertex_t number_of_vertices, edge_t number_of_edges) + GraphViewBase(weight_t* edge_data, vertex_t number_of_vertices, edge_t number_of_edges) : handle(nullptr), edge_data(edge_data), prop(), @@ -111,8 +116,8 @@ class GraphViewBase { template class GraphCOOView : public GraphViewBase { public: - vertex_t *src_indices{nullptr}; ///< rowInd - vertex_t *dst_indices{nullptr}; ///< colInd + vertex_t* src_indices{nullptr}; ///< rowInd + vertex_t* dst_indices{nullptr}; ///< colInd /** * @brief Computes degree(in, out, in+out) of all the nodes of a Graph @@ -124,7 +129,7 @@ class GraphCOOView : public GraphViewBase { * to zeros. Will contain the computed degree of every vertex. 
* @param[in] direction IN_PLUS_OUT, IN or OUT */ - void degree(edge_t *degree, DegreeDirection direction) const; + void degree(edge_t* degree, DegreeDirection direction) const; /** * @brief Default constructor @@ -151,9 +156,9 @@ class GraphCOOView : public GraphViewBase { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCOOView(vertex_t *src_indices, - vertex_t *dst_indices, - weight_t *edge_data, + GraphCOOView(vertex_t* src_indices, + vertex_t* dst_indices, + weight_t* edge_data, vertex_t number_of_vertices, edge_t number_of_edges) : GraphViewBase(edge_data, number_of_vertices, number_of_edges), @@ -175,8 +180,8 @@ class GraphCOOView : public GraphViewBase { template class GraphCompressedSparseBaseView : public GraphViewBase { public: - edge_t *offsets{nullptr}; ///< CSR offsets - vertex_t *indices{nullptr}; ///< CSR indices + edge_t* offsets{nullptr}; ///< CSR offsets + vertex_t* indices{nullptr}; ///< CSR indices /** * @brief Fill the identifiers in the array with the source vertex @@ -185,7 +190,7 @@ class GraphCompressedSparseBaseView : public GraphViewBase(edge_data, number_of_vertices, number_of_edges), @@ -275,9 +280,9 @@ class GraphCSRView : public GraphCompressedSparseBaseView( @@ -326,9 +331,9 @@ class GraphCSCView : public GraphCompressedSparseBaseView( @@ -394,7 +399,7 @@ class GraphCOO { edge_t number_of_edges, bool has_data = false, cudaStream_t stream = nullptr, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : number_of_vertices_p(number_of_vertices), number_of_edges_p(number_of_edges), src_indices_p(sizeof(vertex_t) * number_of_edges, stream, mr), @@ -403,9 +408,9 @@ class GraphCOO { { } - GraphCOO(GraphCOOView const &graph, + GraphCOO(GraphCOOView const& graph, cudaStream_t stream = nullptr, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : number_of_vertices_p(graph.number_of_vertices), number_of_edges_p(graph.number_of_edges), src_indices_p(graph.src_indices, graph.number_of_edges * sizeof(vertex_t), stream, mr), @@ -416,7 +421,7 @@ class GraphCOO { rmm::device_buffer{graph.edge_data, graph.number_of_edges * sizeof(weight_t), stream, mr}; } } - GraphCOO(GraphCOOContents &&contents) + GraphCOO(GraphCOOContents&& contents) : number_of_vertices_p(contents.number_of_vertices), number_of_edges_p(contents.number_of_edges), src_indices_p(std::move(*(contents.src_indices.release()))), @@ -427,9 +432,9 @@ class GraphCOO { vertex_t number_of_vertices(void) { return number_of_vertices_p; } edge_t number_of_edges(void) { return number_of_edges_p; } - vertex_t *src_indices(void) { return static_cast(src_indices_p.data()); } - vertex_t *dst_indices(void) { return static_cast(dst_indices_p.data()); } - weight_t *edge_data(void) { return static_cast(edge_data_p.data()); } + vertex_t* src_indices(void) { return static_cast(src_indices_p.data()); } + vertex_t* dst_indices(void) { return static_cast(dst_indices_p.data()); } + weight_t* edge_data(void) { return static_cast(edge_data_p.data()); } GraphCOOContents release() noexcept { @@ -500,7 +505,7 @@ class GraphCompressedSparseBase { edge_t number_of_edges, bool has_data, cudaStream_t stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : number_of_vertices_p(number_of_vertices), 
number_of_edges_p(number_of_edges), offsets_p(sizeof(edge_t) * (number_of_vertices + 1), stream, mr), @@ -509,7 +514,7 @@ class GraphCompressedSparseBase { { } - GraphCompressedSparseBase(GraphSparseContents &&contents) + GraphCompressedSparseBase(GraphSparseContents&& contents) : number_of_vertices_p(contents.number_of_vertices), number_of_edges_p(contents.number_of_edges), offsets_p(std::move(*contents.offsets.release())), @@ -520,9 +525,9 @@ class GraphCompressedSparseBase { vertex_t number_of_vertices(void) { return number_of_vertices_p; } edge_t number_of_edges(void) { return number_of_edges_p; } - edge_t *offsets(void) { return static_cast(offsets_p.data()); } - vertex_t *indices(void) { return static_cast(indices_p.data()); } - weight_t *edge_data(void) { return static_cast(edge_data_p.data()); } + edge_t* offsets(void) { return static_cast(offsets_p.data()); } + vertex_t* indices(void) { return static_cast(indices_p.data()); } + weight_t* edge_data(void) { return static_cast(edge_data_p.data()); } GraphSparseContents release() noexcept { @@ -570,13 +575,13 @@ class GraphCSR : public GraphCompressedSparseBase { edge_t number_of_edges_, bool has_data_ = false, cudaStream_t stream = nullptr, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : GraphCompressedSparseBase( number_of_vertices_, number_of_edges_, has_data_, stream, mr) { } - GraphCSR(GraphSparseContents &&contents) + GraphCSR(GraphSparseContents&& contents) : GraphCompressedSparseBase(std::move(contents)) { } @@ -621,13 +626,13 @@ class GraphCSC : public GraphCompressedSparseBase { edge_t number_of_edges_in, bool has_data_in = false, cudaStream_t stream = nullptr, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : GraphCompressedSparseBase( number_of_vertices_in, number_of_edges_in, has_data_in, stream, mr) { } - GraphCSC(GraphSparseContents &&contents) + GraphCSC(GraphSparseContents&& contents) : GraphCompressedSparseBase( std::forward>(contents)) { @@ -668,6 +673,8 @@ struct invalid_vertex_id : invalid_idx { template struct invalid_edge_id : invalid_idx { }; + +} // namespace legacy } // namespace cugraph #include "eidecl_graph.hpp" diff --git a/cpp/include/cugraph/matrix_partition_device_view.cuh b/cpp/include/cugraph/matrix_partition_device_view.cuh new file mode 100644 index 00000000000..f96419f9600 --- /dev/null +++ b/cpp/include/cugraph/matrix_partition_device_view.cuh @@ -0,0 +1,305 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
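The `graph.hpp` hunks above complete the move of the legacy COO/CSR/CSC containers into `cugraph::legacy`. For call sites this means a new include path and namespace qualifier; the owning containers also keep their `release()` escape hatch, which hands the underlying buffers back through a contents struct. A sketch, assuming the `int32_t`/`int32_t`/`float` instantiation that the `eidecl`/`eidir` headers declare:

```cpp
#include <cugraph/legacy/graph.hpp>  // was <graph.hpp> before this reorganization

#include <utility>

void legacy_csr_example(int32_t num_vertices, int32_t num_edges, cudaStream_t stream)
{
  // Owning container: allocates offsets/indices (and edge data) on `stream`.
  cugraph::legacy::GraphCSR<int32_t, int32_t, float> csr(
    num_vertices, num_edges, /* has_data_ = */ true, stream);

  // Non-owning view over the same buffers, using the accessors declared above;
  // the only call-site change from the old layout is the legacy:: qualifier.
  cugraph::legacy::GraphCSRView<int32_t, int32_t, float> view(
    csr.offsets(), csr.indices(), csr.edge_data(), num_vertices, num_edges);

  // release() returns a GraphSparseContents holding the buffers as
  // std::unique_ptr<rmm::device_buffer>; a graph can be rebuilt from it by move.
  auto contents = csr.release();
  cugraph::legacy::GraphCSR<int32_t, int32_t, float> csr2(std::move(contents));
}
```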
+ */ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +template +class matrix_partition_device_view_base_t { + public: + matrix_partition_device_view_base_t(edge_t const* offsets, + vertex_t const* indices, + std::optional weights, + edge_t number_of_edges) + : offsets_(offsets), + indices_(indices), + weights_(weights ? thrust::optional(*weights) : thrust::nullopt), + number_of_edges_(number_of_edges) + { + } + + __host__ __device__ edge_t get_number_of_edges() const { return number_of_edges_; } + + __host__ __device__ edge_t const* get_offsets() const { return offsets_; } + __host__ __device__ vertex_t const* get_indices() const { return indices_; } + __host__ __device__ thrust::optional get_weights() const { return weights_; } + + // major_idx == major offset if CSR/CSC, major_offset != major_idx if DCSR/DCSC + __device__ thrust::tuple, edge_t> + get_local_edges(vertex_t major_idx) const noexcept + { + auto edge_offset = *(offsets_ + major_idx); + auto local_degree = *(offsets_ + (major_idx + 1)) - edge_offset; + auto indices = indices_ + edge_offset; + auto weights = + weights_ ? thrust::optional{*weights_ + edge_offset} : thrust::nullopt; + return thrust::make_tuple(indices, weights, local_degree); + } + + // major_idx == major offset if CSR/CSC, major_offset != major_idx if DCSR/DCSC + __device__ edge_t get_local_degree(vertex_t major_idx) const noexcept + { + return *(offsets_ + (major_idx + 1)) - *(offsets_ + major_idx); + } + + // major_idx == major offset if CSR/CSC, major_offset != major_idx if DCSR/DCSC + __device__ edge_t get_local_offset(vertex_t major_idx) const noexcept + { + return *(offsets_ + major_idx); + } + + private: + // should be trivially copyable to device + edge_t const* offsets_{nullptr}; + vertex_t const* indices_{nullptr}; + thrust::optional weights_{thrust::nullopt}; + edge_t number_of_edges_{0}; +}; + +} // namespace detail + +template +class matrix_partition_device_view_t; + +// multi-GPU version +template +class matrix_partition_device_view_t> + : public detail::matrix_partition_device_view_base_t { + public: + matrix_partition_device_view_t( + matrix_partition_view_t view) + : detail::matrix_partition_device_view_base_t( + view.get_offsets(), view.get_indices(), view.get_weights(), view.get_number_of_edges()), + dcs_nzd_vertices_(view.get_dcs_nzd_vertices() + ? thrust::optional{*(view.get_dcs_nzd_vertices())} + : thrust::nullopt), + dcs_nzd_vertex_count_(view.get_dcs_nzd_vertex_count() + ? 
thrust::optional{*(view.get_dcs_nzd_vertex_count())} + : thrust::nullopt), + major_first_(view.get_major_first()), + major_last_(view.get_major_last()), + minor_first_(view.get_minor_first()), + minor_last_(view.get_minor_last()), + major_value_start_offset_(view.get_major_value_start_offset()) + { + } + + __host__ __device__ vertex_t get_major_first() const noexcept { return major_first_; } + + __host__ __device__ vertex_t get_major_last() const noexcept { return major_last_; } + + __host__ __device__ vertex_t get_major_size() const noexcept + { + return major_last_ - major_first_; + } + + __host__ __device__ vertex_t get_minor_first() const noexcept { return minor_first_; } + + __host__ __device__ vertex_t get_minor_last() const noexcept { return minor_last_; } + + __host__ __device__ vertex_t get_minor_size() const noexcept + { + return minor_last_ - minor_first_; + } + + __host__ __device__ vertex_t get_major_offset_from_major_nocheck(vertex_t major) const noexcept + { + return major - major_first_; + } + + __host__ __device__ vertex_t get_minor_offset_from_minor_nocheck(vertex_t minor) const noexcept + { + return minor - minor_first_; + } + + __host__ __device__ vertex_t + get_major_from_major_offset_nocheck(vertex_t major_offset) const noexcept + { + return major_first_ + major_offset; + } + + // major_hypersparse_idx: index within the hypersparse segment + __host__ __device__ thrust::optional get_major_hypersparse_idx_from_major_nocheck( + vertex_t major) const noexcept + { + if (dcs_nzd_vertices_) { + // we can avoid binary search (and potentially improve performance) if we add an auxiliary + // array or cuco::static_map (at the expense of additional memory) + auto it = thrust::lower_bound( + thrust::seq, *dcs_nzd_vertices_, *dcs_nzd_vertices_ + *dcs_nzd_vertex_count_, major); + return it != *dcs_nzd_vertices_ + *dcs_nzd_vertex_count_ + ? (*it == major ? thrust::optional{static_cast( + thrust::distance(*dcs_nzd_vertices_, it))} + : thrust::nullopt) + : thrust::nullopt; + } else { + return thrust::nullopt; + } + } + + // major_hypersparse_idx: index within the hypersparse segment + __host__ __device__ thrust::optional get_major_from_major_hypersparse_idx_nocheck( + vertex_t major_hypersparse_idx) const noexcept + { + return dcs_nzd_vertices_ + ? 
thrust::optional{(*dcs_nzd_vertices_)[major_hypersparse_idx]} + : thrust::nullopt; + } + + __host__ __device__ vertex_t + get_minor_from_minor_offset_nocheck(vertex_t minor_offset) const noexcept + { + return minor_first_ + minor_offset; + } + + __host__ __device__ vertex_t get_major_value_start_offset() const + { + return major_value_start_offset_; + } + + __host__ __device__ thrust::optional get_dcs_nzd_vertices() const + { + return dcs_nzd_vertices_; + } + __host__ __device__ thrust::optional get_dcs_nzd_vertex_count() const + { + return dcs_nzd_vertex_count_; + } + + private: + // should be trivially copyable to device + + thrust::optional dcs_nzd_vertices_{nullptr}; + thrust::optional dcs_nzd_vertex_count_{0}; + + vertex_t major_first_{0}; + vertex_t major_last_{0}; + vertex_t minor_first_{0}; + vertex_t minor_last_{0}; + + vertex_t major_value_start_offset_{0}; +}; + +// single-GPU version +template +class matrix_partition_device_view_t> + : public detail::matrix_partition_device_view_base_t { + public: + matrix_partition_device_view_t( + matrix_partition_view_t view) + : detail::matrix_partition_device_view_base_t( + view.get_offsets(), view.get_indices(), view.get_weights(), view.get_number_of_edges()), + number_of_vertices_(view.get_major_last()) + { + } + + __host__ __device__ vertex_t get_major_value_start_offset() const { return vertex_t{0}; } + + __host__ __device__ constexpr vertex_t get_major_first() const noexcept { return vertex_t{0}; } + + __host__ __device__ vertex_t get_major_last() const noexcept { return number_of_vertices_; } + + __host__ __device__ vertex_t get_major_size() const noexcept { return number_of_vertices_; } + + __host__ __device__ constexpr vertex_t get_minor_first() const noexcept { return vertex_t{0}; } + + __host__ __device__ vertex_t get_minor_last() const noexcept { return number_of_vertices_; } + + __host__ __device__ vertex_t get_minor_size() const noexcept { return number_of_vertices_; } + + __host__ __device__ vertex_t get_major_offset_from_major_nocheck(vertex_t major) const noexcept + { + return major; + } + + __host__ __device__ vertex_t get_minor_offset_from_minor_nocheck(vertex_t minor) const noexcept + { + return minor; + } + + __host__ __device__ vertex_t + get_major_from_major_offset_nocheck(vertex_t major_offset) const noexcept + { + return major_offset; + } + + // major_hypersparse_idx: index within the hypersparse segment + __host__ __device__ thrust::optional get_major_hypersparse_idx_from_major_nocheck( + vertex_t major) const noexcept + { + assert(false); + return thrust::nullopt; + } + + // major_hypersparse_idx: index within the hypersparse segment + __host__ __device__ thrust::optional get_major_from_major_hypersparse_idx_nocheck( + vertex_t major_hypersparse_idx) const noexcept + { + assert(false); + return thrust::nullopt; + } + + __host__ __device__ vertex_t + get_minor_from_minor_offset_nocheck(vertex_t minor_offset) const noexcept + { + return minor_offset; + } + + __host__ __device__ thrust::optional get_dcs_nzd_vertices() const + { + assert(false); + return thrust::nullopt; + } + __host__ __device__ thrust::optional get_dcs_nzd_vertex_count() const + { + assert(false); + return thrust::nullopt; + } + + private: + vertex_t number_of_vertices_; +}; + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/cugraph/matrix_partition_view.hpp b/cpp/include/cugraph/matrix_partition_view.hpp new file mode 100644 index 00000000000..6036bd2af48 --- /dev/null +++ b/cpp/include/cugraph/matrix_partition_view.hpp 
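`matrix_partition_device_view_t` above is deliberately a bag of raw pointers and scalars so it stays trivially copyable into kernels. Its hypersparse (DCSR/DCSC) lookup works because only majors with non-zero local degree get an offsets entry: the sorted `dcs_nzd_vertices` array maps a vertex ID to its row index via binary search (the code comment notes an auxiliary array or `cuco::static_map` could replace the search at the cost of extra memory). A standalone restatement of that lookup, with illustrative names and `int32_t` vertices:

```cpp
#include <thrust/binary_search.h>
#include <thrust/distance.h>
#include <thrust/execution_policy.h>
#include <thrust/optional.h>

// Map a major vertex ID to its index inside the hypersparse segment, or
// thrust::nullopt if the vertex has no local edges (and hence no DCSR row).
__device__ thrust::optional<int32_t> hypersparse_idx(int32_t const* dcs_nzd_vertices,
                                                     int32_t dcs_nzd_vertex_count,
                                                     int32_t major)
{
  auto last = dcs_nzd_vertices + dcs_nzd_vertex_count;
  auto it   = thrust::lower_bound(thrust::seq, dcs_nzd_vertices, last, major);
  return (it != last && *it == major)
           ? thrust::optional<int32_t>{static_cast<int32_t>(
               thrust::distance(dcs_nzd_vertices, it))}
           : thrust::nullopt;
}
```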
@@ -0,0 +1,139 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +template +class matrix_partition_view_base_t { + public: + matrix_partition_view_base_t(edge_t const* offsets, + vertex_t const* indices, + std::optional weights, + edge_t number_of_edges) + : offsets_(offsets), indices_(indices), weights_(weights), number_of_edges_(number_of_edges) + { + } + + edge_t get_number_of_edges() const { return number_of_edges_; } + + edge_t const* get_offsets() const { return offsets_; } + vertex_t const* get_indices() const { return indices_; } + std::optional get_weights() const { return weights_; } + + private: + edge_t const* offsets_{nullptr}; + vertex_t const* indices_{nullptr}; + std::optional weights_{std::nullopt}; + edge_t number_of_edges_{0}; +}; + +} // namespace detail + +template +class matrix_partition_view_t; + +// multi-GPU version +template +class matrix_partition_view_t> + : public detail::matrix_partition_view_base_t { + public: + matrix_partition_view_t(edge_t const* offsets, + vertex_t const* indices, + std::optional weights, + std::optional dcs_nzd_vertices, + std::optional dcs_nzd_vertex_count, + edge_t number_of_matrix_partition_edges, + vertex_t major_first, + vertex_t major_last, + vertex_t minor_first, + vertex_t minor_last, + vertex_t major_value_start_offset) + : detail::matrix_partition_view_base_t( + offsets, indices, weights, number_of_matrix_partition_edges), + dcs_nzd_vertices_(dcs_nzd_vertices), + dcs_nzd_vertex_count_(dcs_nzd_vertex_count), + major_first_(major_first), + major_last_(major_last), + minor_first_(minor_first), + minor_last_(minor_last), + major_value_start_offset_(major_value_start_offset) + { + } + + std::optional get_dcs_nzd_vertices() const { return dcs_nzd_vertices_; } + std::optional get_dcs_nzd_vertex_count() const { return dcs_nzd_vertex_count_; } + + vertex_t get_major_first() const { return major_first_; } + vertex_t get_major_last() const { return major_last_; } + vertex_t get_minor_first() const { return minor_first_; } + vertex_t get_minor_last() const { return minor_last_; } + + vertex_t get_major_value_start_offset() const { return major_value_start_offset_; } + + private: + // relevant only if we use the CSR + DCSR (or CSC + DCSC) hybrid format + std::optional dcs_nzd_vertices_{}; + std::optional dcs_nzd_vertex_count_{}; + + vertex_t major_first_{0}; + vertex_t major_last_{0}; + vertex_t minor_first_{0}; + vertex_t minor_last_{0}; + + vertex_t major_value_start_offset_{0}; +}; + +// single-GPU version +template +class matrix_partition_view_t> + : public detail::matrix_partition_view_base_t { + public: + matrix_partition_view_t(edge_t const* offsets, + vertex_t const* indices, + std::optional weights, + vertex_t number_of_vertices, + edge_t number_of_edges) + : detail::matrix_partition_view_base_t( + offsets, indices, weights, number_of_edges), + 
number_of_vertices_(number_of_vertices) + { + } + + std::optional get_dcs_nzd_vertices() const { return std::nullopt; } + std::optional get_dcs_nzd_vertex_count() const { return std::nullopt; } + + vertex_t get_major_first() const { return vertex_t{0}; } + vertex_t get_major_last() const { return number_of_vertices_; } + vertex_t get_minor_first() const { return vertex_t{0}; } + vertex_t get_minor_last() const { return number_of_vertices_; } + + private: + vertex_t number_of_vertices_{0}; +}; + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/partition_manager.hpp b/cpp/include/cugraph/partition_manager.hpp similarity index 98% rename from cpp/include/partition_manager.hpp rename to cpp/include/cugraph/partition_manager.hpp index 431655e5642..c7657d459b2 100644 --- a/cpp/include/partition_manager.hpp +++ b/cpp/include/cugraph/partition_manager.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh similarity index 52% rename from cpp/include/patterns/copy_to_adj_matrix_row_col.cuh rename to cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh index 760775c03d4..1aedd952cf6 100644 --- a/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh +++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,13 +15,16 @@ */ #pragma once -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -49,31 +52,49 @@ void copy_to_matrix_major(raft::handle_t const& handle, MatrixMajorValueOutputIterator matrix_major_value_output_first) { if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - std::vector rx_counts(row_comm_size, size_t{0}); - std::vector displacements(row_comm_size, size_t{0}); - for (int i = 0; i < row_comm_size; ++i) { - rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i); - displacements[i] = (i == 0) ? 
0 : displacements[i - 1] + rx_counts[i - 1]; - } - device_allgatherv(row_comm, - vertex_value_input_first, - matrix_major_value_output_first, - rx_counts, - displacements, - handle.get_stream()); + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (beginning of col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + + std::vector rx_counts(col_comm_size, size_t{0}); + std::vector displacements(col_comm_size, size_t{0}); + for (int i = 0; i < col_comm_size; ++i) { + rx_counts[i] = graph_view.get_vertex_partition_size(i * row_comm_size + row_comm_rank); + displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; } + device_allgatherv(col_comm, + vertex_value_input_first, + matrix_major_value_output_first, + rx_counts, + displacements, + handle.get_stream()); + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (end of col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif } else { assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols() @@ -97,90 +118,110 @@ void copy_to_matrix_major(raft::handle_t const& handle, MatrixMajorValueOutputIterator matrix_major_value_output_first) { using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - auto rx_counts = - host_scalar_allgather(row_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - - matrix_partition_device_t matrix_partition(graph_view, 0); - for (int i = 0; i < row_comm_size; ++i) { - rmm::device_uvector rx_vertices(row_comm_rank == i ? 
size_t{0} : rx_counts[i], - handle.get_stream()); - auto rx_tmp_buffer = - allocate_comm_buffer::value_type>( - rx_counts[i], handle.get_stream()); - auto rx_value_first = get_comm_buffer_begin< - typename std::iterator_traits::value_type>(rx_tmp_buffer); - - if (row_comm_rank == i) { - vertex_partition_device_t vertex_partition(graph_view); - auto map_first = - thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { - return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); - }); - // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a - // permutation iterator (and directly gathers to the internal buffer) - thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_value_input_first, - rx_value_first); - } - - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - device_bcast( - row_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); - device_bcast( - row_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - - if (row_comm_rank == i) { - auto map_first = - thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { - return matrix_partition.get_major_offset_from_major_nocheck(v); - }); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_major_value_output_first); - } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), [matrix_partition] __device__(auto v) { - return matrix_partition.get_major_offset_from_major_nocheck(v); - }); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_major_value_output_first); - } - - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // this is as necessary rx_tmp_buffer will become out-of-scope - // once control flow exits this block (FIXME: we can reduce stream - // synchronization if we compute the maximum rx_counts and - // allocate rx_tmp_buffer outside the loop) + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (beginning of col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + + auto rx_counts = + host_scalar_allgather(col_comm, + 
static_cast(thrust::distance(vertex_first, vertex_last)), + handle.get_stream()); + + for (int i = 0; i < col_comm_size; ++i) { + auto matrix_partition = + matrix_partition_device_view_t( + graph_view.get_matrix_partition_view(i)); + + rmm::device_uvector rx_vertices(col_comm_rank == i ? size_t{0} : rx_counts[i], + handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(rx_counts[i], + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_tmp_buffer); + + if (col_comm_rank == i) { + auto vertex_partition = + vertex_partition_device_view_t( + graph_view.get_vertex_partition_view()); + auto map_first = + thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { + return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); + }); + // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a + // permutation iterator (and directly gathers to the internal buffer) + thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + map_first, + map_first + thrust::distance(vertex_first, vertex_last), + vertex_value_input_first, + rx_value_first); + } + + // FIXME: these broadcast operations can be placed between ncclGroupStart() and + // ncclGroupEnd() + device_bcast( + col_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(col_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); + + if (col_comm_rank == i) { + auto map_first = + thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { + return matrix_partition.get_major_offset_from_major_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_major_value_output_first + matrix_partition.get_major_value_start_offset()); + } else { + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), [matrix_partition] __device__(auto v) { + return matrix_partition.get_major_offset_from_major_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_major_value_output_first + matrix_partition.get_major_value_start_offset()); } } + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (end of col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif } else { assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? 
graph_view.get_number_of_local_adj_matrix_partition_cols() @@ -203,59 +244,49 @@ void copy_to_matrix_minor(raft::handle_t const& handle, MatrixMinorValueOutputIterator matrix_minor_value_output_first) { if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph - // partitioning - auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank; - auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - // FIXME: this branch may no longer necessary with NCCL backend - if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_value_input_first, - vertex_value_input_first + graph_view.get_number_of_local_vertices(), - matrix_minor_value_output_first + - (graph_view.get_vertex_partition_first(comm_src_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size))); - } else { - device_sendrecv( - comm, - vertex_value_input_first, - static_cast(graph_view.get_number_of_local_vertices()), - comm_dst_rank, - matrix_minor_value_output_first + - (graph_view.get_vertex_partition_first(comm_src_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)), - static_cast(graph_view.get_vertex_partition_size(comm_src_rank)), - comm_src_rank, - handle.get_stream()); - } - - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - for (int i = 0; i < col_comm_size; ++i) { - auto offset = graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + i) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size); - auto count = graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + i); - device_bcast(col_comm, - matrix_minor_value_output_first + offset, - matrix_minor_value_output_first + offset, - count, - i, - handle.get_stream()); - } + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (beginning of row_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + + std::vector rx_counts(row_comm_size, size_t{0}); + std::vector displacements(row_comm_size, size_t{0}); + for (int i = 
0; i < row_comm_size; ++i) { + rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i); + displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; } + device_allgatherv(row_comm, + vertex_value_input_first, + matrix_minor_value_output_first, + rx_counts, + displacements, + handle.get_stream()); + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (end of row_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif } else { assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? graph_view.get_number_of_local_adj_matrix_partition_rows() @@ -279,162 +310,107 @@ void copy_to_matrix_minor(raft::handle_t const& handle, MatrixMinorValueOutputIterator matrix_minor_value_output_first) { using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - // FIXME: this P2P is unnecessary if apply the same partitioning scheme regardless of - // hypergraph partitioning is applied or not - auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank; - auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - size_t tx_count = thrust::distance(vertex_first, vertex_last); - size_t rx_count{}; - // FIXME: it seems like raft::isend and raft::irecv do not properly handle the destination (or - // source) == self case. Need to double check and fix this if this is indeed the case (or RAFT - // may use ncclSend/ncclRecv instead of UCX for device data). 
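The multi-GPU branches above all share one shape: bracket the collective with `host_barrier` (the `#if 1` hack standing in until a UCC barrier is available through RAFT) so that back-to-back collectives on different communicators cannot interleave and deadlock, then `device_allgatherv` each rank's vertex-partition slice into the row/column output. The receive layout is simply an exclusive prefix sum of the per-rank partition sizes; a host-side sketch where `partition_sizes` stands in for the `graph_view.get_vertex_partition_size(...)` queries:

```cpp
#include <cstddef>
#include <numeric>
#include <utility>
#include <vector>

// Compute the (rx_counts, displacements) pair fed to device_allgatherv: rank i
// contributes partition_sizes[i] elements, landing at the exclusive prefix sum.
std::pair<std::vector<size_t>, std::vector<size_t>> allgatherv_layout(
  std::vector<size_t> const& partition_sizes)
{
  std::vector<size_t> rx_counts = partition_sizes;
  std::vector<size_t> displacements(rx_counts.size(), size_t{0});
  std::exclusive_scan(rx_counts.begin(), rx_counts.end(), displacements.begin(), size_t{0});
  return {rx_counts, displacements};
}
```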
- if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - rx_count = tx_count; - } else { - std::vector count_requests(2); - comm.isend(&tx_count, 1, comm_dst_rank, 0 /* tag */, count_requests.data()); - comm.irecv(&rx_count, 1, comm_src_rank, 0 /* tag */, count_requests.data() + 1); - comm.waitall(count_requests.size(), count_requests.data()); - } - - vertex_partition_device_t vertex_partition(graph_view); - rmm::device_uvector dst_vertices(rx_count, handle.get_stream()); - auto dst_tmp_buffer = - allocate_comm_buffer::value_type>( - rx_count, handle.get_stream()); - auto dst_value_first = - get_comm_buffer_begin::value_type>( - dst_tmp_buffer); - if (comm_src_rank == comm_rank) { - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_first, - vertex_last, - dst_vertices.begin()); + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (beginning of row_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + + auto rx_counts = + host_scalar_allgather(row_comm, + static_cast(thrust::distance(vertex_first, vertex_last)), + handle.get_stream()); + + auto matrix_partition = + matrix_partition_device_view_t( + graph_view.get_matrix_partition_view(size_t{0})); + for (int i = 0; i < row_comm_size; ++i) { + rmm::device_uvector rx_vertices(row_comm_rank == i ? 
size_t{0} : rx_counts[i], + handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(rx_counts[i], + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_tmp_buffer); + + if (row_comm_rank == i) { + auto vertex_partition = + vertex_partition_device_view_t( + graph_view.get_vertex_partition_view()); auto map_first = thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); }); + // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a + // permutation iterator (and directly gathers to the internal buffer) thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), map_first, map_first + thrust::distance(vertex_first, vertex_last), vertex_value_input_first, - dst_value_first); - } else { - auto src_tmp_buffer = - allocate_comm_buffer::value_type>( - tx_count, handle.get_stream()); - auto src_value_first = get_comm_buffer_begin< - typename std::iterator_traits::value_type>(src_tmp_buffer); + rx_value_first); + } + + // FIXME: these broadcast operations can be placed between ncclGroupStart() and + // ncclGroupEnd() + device_bcast( + row_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(row_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); + if (row_comm_rank == i) { auto map_first = - thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { - return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); + thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { + return matrix_partition.get_minor_offset_from_minor_nocheck(v); }); - thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_value_input_first, - src_value_first); - - device_sendrecv( - comm, - vertex_first, - tx_count, - comm_dst_rank, - dst_vertices.begin(), - rx_count, - comm_src_rank, - handle.get_stream()); - - device_sendrecv(comm, - src_value_first, - tx_count, - comm_dst_rank, - dst_value_first, - rx_count, - comm_src_rank, - handle.get_stream()); - - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // this is as necessary src_tmp_buffer will become out-of-scope - // once control flow exits this block - } - - // FIXME: now we can clear tx_tmp_buffer - - auto rx_counts = host_scalar_allgather(col_comm, rx_count, handle.get_stream()); - - matrix_partition_device_t matrix_partition(graph_view, 0); - for (int i = 0; i < col_comm_size; ++i) { - rmm::device_uvector rx_vertices(col_comm_rank == i ? 
size_t{0} : rx_counts[i], - handle.get_stream()); - auto rx_tmp_buffer = - allocate_comm_buffer::value_type>( - rx_counts[i], handle.get_stream()); - auto rx_value_first = get_comm_buffer_begin< - typename std::iterator_traits::value_type>(rx_tmp_buffer); - - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - device_bcast(col_comm, - dst_vertices.begin(), - rx_vertices.begin(), - rx_counts[i], - i, - handle.get_stream()); - device_bcast( - col_comm, dst_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - - if (col_comm_rank == i) { - auto map_first = thrust::make_transform_iterator( - dst_vertices.begin(), [matrix_partition] __device__(auto v) { - return matrix_partition.get_minor_offset_from_minor_nocheck(v); - }); - - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - dst_value_first, - dst_value_first + rx_counts[i], - map_first, - matrix_minor_value_output_first); - } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), [matrix_partition] __device__(auto v) { - return matrix_partition.get_minor_offset_from_minor_nocheck(v); - }); - - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_minor_value_output_first); - } - - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // this is as necessary rx_tmp_buffer will become out-of-scope - // once control flow exits this block (FIXME: we can reduce stream - // synchronization if we compute the maximum rx_counts and - // allocate rx_tmp_buffer outside the loop) + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_minor_value_output_first); + } else { + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), [matrix_partition] __device__(auto v) { + return matrix_partition.get_minor_offset_from_minor_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_minor_value_output_first); } - - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // this is as necessary dst_tmp_buffer will become out-of-scope once - // control flow exits this block } + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (end of row_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif } else { assert(graph_view.get_number_of_local_vertices() == graph_view.get_number_of_local_adj_matrix_partition_rows()); diff --git a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh similarity index 52% rename from cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh rename to cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh 
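The `vertex_first`/`vertex_last` overloads above replace the removed point-to-point path with a per-rank gather, broadcast, scatter round: the broadcasting rank gathers its values into a dense staging buffer (the values are not contiguous in vertex-ID order), `device_bcast` ships the vertex IDs and values across the sub-communicator, and every rank scatters into its output at the appropriate major/minor offsets. A single-GPU sketch of the two thrust halves, eliding the broadcast and using one illustrative offset map where the real code uses separate owner-side and receiver-side maps:

```cpp
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/scatter.h>

// Gather a vertex subset's values into a dense staging buffer, then scatter them
// to their destination offsets. In the production code, device_bcast moves the
// staging buffer between these two steps.
void gather_then_scatter(thrust::device_vector<float> const& vertex_values,
                         thrust::device_vector<int32_t> const& offsets,  // illustrative map
                         thrust::device_vector<float>& output)
{
  thrust::device_vector<float> staging(offsets.size());
  thrust::gather(offsets.begin(), offsets.end(),
                 vertex_values.begin(), staging.begin());  // broadcasting-rank side
  thrust::scatter(staging.begin(), staging.end(),
                  offsets.begin(), output.begin());        // receiving-rank side
}
```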
index f3c36897dd6..148549fa99a 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,12 +15,14 @@ */ #pragma once -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -41,23 +43,104 @@ namespace experimental { namespace detail { -// FIXME: block size requires tuning -int32_t constexpr copy_v_transform_reduce_nbr_for_all_block_size = 128; +int32_t constexpr copy_v_transform_reduce_nbr_for_all_block_size = 512; -#if 0 -// FIXME: delete this once we verify that the thrust replace in for_all_major_for_all_nbr_low_degree is no slower than the original for loop based imoplementation -template -__device__ std::enable_if_t accumulate_edge_op_result(T& lhs, T const& rhs) +template +__global__ void for_all_major_for_all_nbr_hypersparse( + matrix_partition_device_view_t matrix_partition, + typename GraphViewType::vertex_type major_hypersparse_first, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + ResultValueOutputIterator result_value_output_first, + EdgeOp e_op, + T init /* relevent only if update_major == true */) { - lhs = plus_edge_op_result(lhs, rhs); -} + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; -template -__device__ std::enable_if_t accumulate_edge_op_result(T& lhs, T const& rhs) -{ - atomic_add(&lhs, rhs); + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto major_start_offset = + static_cast(major_hypersparse_first - matrix_partition.get_major_first()); + auto idx = static_cast(tid); + + auto dcs_nzd_vertex_count = *(matrix_partition.get_dcs_nzd_vertex_count()); + + property_add edge_property_add{}; + while (idx < static_cast(dcs_nzd_vertex_count)) { + auto major = + *(matrix_partition.get_major_from_major_hypersparse_idx_nocheck(static_cast(idx))); + auto major_idx = + major_start_offset + idx; // major_offset != major_idx in the hypersparse region + vertex_t const* indices{nullptr}; + thrust::optional weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = + matrix_partition.get_local_edges(static_cast(major_idx)); + auto transform_op = [&matrix_partition, + &adj_matrix_row_value_input_first, + &adj_matrix_col_value_input_first, + &e_op, + major, + indices, + weights] __device__(auto i) { + auto major_offset = matrix_partition.get_major_offset_from_major_nocheck(major); + auto minor = indices[i]; + auto weight = weights ? (*weights)[i] : weight_t{1.0}; + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + auto row = GraphViewType::is_adj_matrix_transposed ? minor : major; + auto col = GraphViewType::is_adj_matrix_transposed ? major : minor; + auto row_offset = GraphViewType::is_adj_matrix_transposed + ? minor_offset + : static_cast(major_offset); + auto col_offset = GraphViewType::is_adj_matrix_transposed + ? 
static_cast(major_offset) + : minor_offset; + return evaluate_edge_op() + .compute(row, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); + }; + + if (update_major) { + *(result_value_output_first + (major - major_hypersparse_first)) = + thrust::transform_reduce(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_op, + init, + edge_property_add); + } else { + thrust::for_each( + thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + [&matrix_partition, indices, &result_value_output_first, &transform_op] __device__(auto i) { + auto e_op_result = transform_op(i); + auto minor = indices[i]; + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + atomic_accumulate_edge_op_result(result_value_output_first + minor_offset, e_op_result); + }); + } + idx += gridDim.x * blockDim.x; + } } -#endif template __global__ void for_all_major_for_all_nbr_low_degree( - matrix_partition_device_t matrix_partition, + matrix_partition_device_view_t matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, @@ -76,23 +162,22 @@ __global__ void for_all_major_for_all_nbr_low_degree( EdgeOp e_op, T init /* relevent only if update_major == true */) { - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using weight_t = typename GraphViewType::weight_type; - using e_op_result_t = T; + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; auto const tid = threadIdx.x + blockIdx.x * blockDim.x; auto major_start_offset = static_cast(major_first - matrix_partition.get_major_first()); auto idx = static_cast(tid); + property_add edge_property_add{}; while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; vertex_t const* indices{nullptr}; - weight_t const* weights{nullptr}; + thrust::optional weights{nullptr}; edge_t local_degree{}; - auto major_offset = major_start_offset + idx; thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(static_cast(major_offset)); -#if 1 auto transform_op = [&matrix_partition, &adj_matrix_row_value_input_first, &adj_matrix_col_value_input_first, @@ -101,21 +186,22 @@ __global__ void for_all_major_for_all_nbr_low_degree( indices, weights] __device__(auto i) { auto minor = indices[i]; - auto weight = weights != nullptr ? weights[i] : weight_t{1.0}; + auto weight = weights ? (*weights)[i] : weight_t{1.0}; auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); auto row = GraphViewType::is_adj_matrix_transposed - ? minor - : matrix_partition.get_major_from_major_offset_nocheck(major_offset); - auto col = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_from_major_offset_nocheck(major_offset) - : minor; - auto row_offset = GraphViewType::is_adj_matrix_transposed - ? minor_offset - : static_cast(major_offset); - auto col_offset = GraphViewType::is_adj_matrix_transposed - ? static_cast(major_offset) - : minor_offset; + ? minor + : matrix_partition.get_major_from_major_offset_nocheck(major_offset); + auto col = GraphViewType::is_adj_matrix_transposed + ? 
matrix_partition.get_major_from_major_offset_nocheck(major_offset) + : minor; + auto row_offset = GraphViewType::is_adj_matrix_transposed + ? minor_offset + : static_cast(major_offset); + auto col_offset = GraphViewType::is_adj_matrix_transposed + ? static_cast(major_offset) + : minor_offset; return evaluate_edge_op() @@ -128,13 +214,13 @@ __global__ void for_all_major_for_all_nbr_low_degree( }; if (update_major) { - *(result_value_output_first + idx) = thrust::transform_reduce( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_op, - init, - [] __device__(auto lhs, auto rhs) { return plus_edge_op_result(lhs, rhs); }); + *(result_value_output_first + idx) = + thrust::transform_reduce(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_op, + init, + edge_property_add); } else { thrust::for_each( thrust::seq, @@ -147,44 +233,6 @@ __global__ void for_all_major_for_all_nbr_low_degree( atomic_accumulate_edge_op_result(result_value_output_first + minor_offset, e_op_result); }); } -#else - // FIXME: delete this once we verify that the code above is not slower than this. - e_op_result_t e_op_result_sum{init}; // relevent only if update_major == true - for (edge_t i = 0; i < local_degree; ++i) { - auto minor = indices[i]; - auto weight = weights != nullptr ? weights[i] : weight_t{1.0}; - auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); - auto row = GraphViewType::is_adj_matrix_transposed - ? minor - : matrix_partition.get_major_from_major_offset_nocheck(major_offset); - auto col = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_from_major_offset_nocheck(major_offset) - : minor; - auto row_offset = GraphViewType::is_adj_matrix_transposed - ? minor_offset - : static_cast(major_offset); - auto col_offset = GraphViewType::is_adj_matrix_transposed - ? static_cast(major_offset) - : minor_offset; - auto e_op_result = evaluate_edge_op() - .compute(row, - col, - weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), - e_op); - if (update_major) { - accumulate_edge_op_result(e_op_result_sum, e_op_result); - } else { - accumulate_edge_op_result(*(result_value_output_first + minor_offset), - e_op_result); - } - } - if (update_major) { *(result_value_output_first + idx) = e_op_result_sum; } -#endif idx += gridDim.x * blockDim.x; } } @@ -197,7 +245,10 @@ template __global__ void for_all_major_for_all_nbr_mid_degree( - matrix_partition_device_t matrix_partition, + matrix_partition_device_view_t matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, @@ -217,31 +268,33 @@ __global__ void for_all_major_for_all_nbr_mid_degree( auto major_start_offset = static_cast(major_first - matrix_partition.get_major_first()); auto idx = static_cast(tid / raft::warp_size()); + property_add edge_property_add{}; while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; vertex_t const* indices{nullptr}; - weight_t const* weights{nullptr}; + thrust::optional weights{nullptr}; edge_t local_degree{}; - auto major_offset = major_start_offset + idx; thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(major_offset); auto e_op_result_sum = lane_id == 0 ? 
init : e_op_result_t{}; // relevent only if update_major == true - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size) { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { auto minor = indices[i]; - auto weight = weights != nullptr ? weights[i] : weight_t{1.0}; + auto weight = weights ? (*weights)[i] : weight_t{1.0}; auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); auto row = GraphViewType::is_adj_matrix_transposed - ? minor - : matrix_partition.get_major_from_major_offset_nocheck(major_offset); - auto col = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_from_major_offset_nocheck(major_offset) - : minor; - auto row_offset = GraphViewType::is_adj_matrix_transposed - ? minor_offset - : static_cast(major_offset); - auto col_offset = GraphViewType::is_adj_matrix_transposed - ? static_cast(major_offset) - : minor_offset; - auto e_op_result = evaluate_edge_op(major_offset); + auto col_offset = GraphViewType::is_adj_matrix_transposed + ? static_cast(major_offset) + : minor_offset; + auto e_op_result = evaluate_edge_op() @@ -252,7 +305,7 @@ __global__ void for_all_major_for_all_nbr_mid_degree( *(adj_matrix_col_value_input_first + col_offset), e_op); if (update_major) { - e_op_result_sum = plus_edge_op_result(e_op_result_sum, e_op_result); + e_op_result_sum = edge_property_add(e_op_result_sum, e_op_result); } else { atomic_accumulate_edge_op_result(result_value_output_first + minor_offset, e_op_result); } @@ -274,7 +327,10 @@ template __global__ void for_all_major_for_all_nbr_high_degree( - matrix_partition_device_t matrix_partition, + matrix_partition_device_view_t matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, @@ -291,31 +347,33 @@ __global__ void for_all_major_for_all_nbr_high_degree( auto major_start_offset = static_cast(major_first - matrix_partition.get_major_first()); auto idx = static_cast(blockIdx.x); + property_add edge_property_add{}; while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; vertex_t const* indices{nullptr}; - weight_t const* weights{nullptr}; + thrust::optional weights{nullptr}; edge_t local_degree{}; - auto major_offset = major_start_offset + idx; thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(major_offset); auto e_op_result_sum = threadIdx.x == 0 ? init : e_op_result_t{}; // relevent only if update_major == true for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { auto minor = indices[i]; - auto weight = weights != nullptr ? weights[i] : weight_t{1.0}; + auto weight = weights ? (*weights)[i] : weight_t{1.0}; auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); auto row = GraphViewType::is_adj_matrix_transposed - ? minor - : matrix_partition.get_major_from_major_offset_nocheck(major_offset); - auto col = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_from_major_offset_nocheck(major_offset) - : minor; - auto row_offset = GraphViewType::is_adj_matrix_transposed - ? minor_offset - : static_cast(major_offset); - auto col_offset = GraphViewType::is_adj_matrix_transposed - ? static_cast(major_offset) - : minor_offset; - auto e_op_result = evaluate_edge_op(major_offset); + auto col_offset = GraphViewType::is_adj_matrix_transposed + ? 
static_cast(major_offset) + : minor_offset; + auto e_op_result = evaluate_edge_op() @@ -326,7 +384,7 @@ __global__ void for_all_major_for_all_nbr_high_degree( *(adj_matrix_col_value_input_first + col_offset), e_op); if (update_major) { - e_op_result_sum = plus_edge_op_result(e_op_result_sum, e_op_result); + e_op_result_sum = edge_property_add(e_op_result_sum, e_op_result); } else { atomic_accumulate_edge_op_result(result_value_output_first + minor_offset, e_op_result); } @@ -357,38 +415,28 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, T init, VertexValueOutputIterator vertex_value_output_first) { - using vertex_t = typename GraphViewType::vertex_type; + constexpr auto update_major = (in == GraphViewType::is_adj_matrix_transposed); + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - auto loop_count = size_t{1}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? graph_view.get_number_of_local_adj_matrix_partitions() - : static_cast(row_comm_size); - } - auto comm_rank = handle.comms_initialized() ? handle.get_comms().get_rank() : int{0}; - auto minor_tmp_buffer_size = (GraphViewType::is_multi_gpu && (in != GraphViewType::is_adj_matrix_transposed)) ? GraphViewType::is_adj_matrix_transposed ? graph_view.get_number_of_local_adj_matrix_partition_rows() : graph_view.get_number_of_local_adj_matrix_partition_cols() : vertex_t{0}; - auto minor_tmp_buffer = allocate_comm_buffer(minor_tmp_buffer_size, handle.get_stream()); - auto minor_buffer_first = get_comm_buffer_begin(minor_tmp_buffer); + auto minor_tmp_buffer = allocate_dataframe_buffer(minor_tmp_buffer_size, handle.get_stream()); + auto minor_buffer_first = get_dataframe_buffer_begin(minor_tmp_buffer); if (in != GraphViewType::is_adj_matrix_transposed) { auto minor_init = init; if (GraphViewType::is_multi_gpu) { auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); auto const row_comm_rank = row_comm.get_rank(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - minor_init = graph_view.is_hypergraph_partitioned() ? (row_comm_rank == 0) ? init : T{} - : (col_comm_rank == 0) ? init : T{}; + minor_init = (row_comm_rank == 0) ? init : T{}; } if (GraphViewType::is_multi_gpu) { @@ -406,96 +454,139 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, assert(minor_tmp_buffer_size == 0); } - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 
0 : i); - - auto major_tmp_buffer_size = vertex_t{0}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + auto matrix_partition = + matrix_partition_device_view_t( + graph_view.get_matrix_partition_view(i)); - major_tmp_buffer_size = - (in == GraphViewType::is_adj_matrix_transposed) - ? graph_view.is_hypergraph_partitioned() - ? matrix_partition.get_major_size() - : graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i) - : vertex_t{0}; - } - auto major_tmp_buffer = allocate_comm_buffer(major_tmp_buffer_size, handle.get_stream()); - auto major_buffer_first = get_comm_buffer_begin(major_tmp_buffer); + auto major_tmp_buffer_size = + GraphViewType::is_multi_gpu && update_major ? matrix_partition.get_major_size() : vertex_t{0}; + auto major_tmp_buffer = + allocate_dataframe_buffer(major_tmp_buffer_size, handle.get_stream()); + auto major_buffer_first = get_dataframe_buffer_begin(major_tmp_buffer); auto major_init = T{}; - if (in == GraphViewType::is_adj_matrix_transposed) { + if (update_major) { if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - major_init = graph_view.is_hypergraph_partitioned() ? (col_comm_rank == 0) ? init : T{} - : (row_comm_rank == 0) ? init : T{}; + major_init = (col_comm_rank == 0) ? init : T{}; } else { major_init = init; } } - int comm_root_rank = 0; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - comm_root_rank = graph_view.is_hypergraph_partitioned() ? i * row_comm_size + row_comm_rank - : col_comm_rank * row_comm_size + i; + auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? vertex_t{0} + : matrix_partition.get_major_value_start_offset(); + auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_value_start_offset() + : vertex_t{0}; + std::conditional_t< + GraphViewType::is_multi_gpu, + std::conditional_t, + VertexValueOutputIterator> + output_buffer_first{}; + if constexpr (GraphViewType::is_multi_gpu) { + output_buffer_first = update_major ? 
major_buffer_first : minor_buffer_first; + } else { + output_buffer_first = vertex_value_output_first; } - - if (graph_view.get_vertex_partition_size(comm_root_rank) > 0) { - raft::grid_1d_thread_t update_grid(graph_view.get_vertex_partition_size(comm_root_rank), + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + if (segment_offsets) { + // FIXME: we may further improve performance by 1) concurrently running kernels on different + // segments; 2) individually tuning block sizes for different segments; and 3) adding one more + // segment for very high degree vertices and running segmented reduction + static_assert(detail::num_sparse_segments_per_vertex_partition == 3); + if ((*segment_offsets)[1] > 0) { + raft::grid_1d_block_t update_grid((*segment_offsets)[1], + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_high_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first(), + matrix_partition.get_major_first() + (*segment_offsets)[1], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + output_buffer_first, + e_op, + major_init); + } + if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1], detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); - - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - - auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? vertex_t{0} - : matrix_partition.get_major_value_start_offset(); - auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_value_start_offset() - : vertex_t{0}; - - detail::for_all_major_for_all_nbr_low_degree + detail::for_all_major_for_all_nbr_mid_degree <<>>( matrix_partition, - graph_view.get_vertex_partition_first(comm_root_rank), - graph_view.get_vertex_partition_last(comm_root_rank), + matrix_partition.get_major_first() + (*segment_offsets)[1], + matrix_partition.get_major_first() + (*segment_offsets)[2], adj_matrix_row_value_input_first + row_value_input_offset, adj_matrix_col_value_input_first + col_value_input_offset, - (in == GraphViewType::is_adj_matrix_transposed) ? major_buffer_first - : minor_buffer_first, + output_buffer_first + (update_major ? (*segment_offsets)[1] : vertex_t{0}), e_op, major_init); - } else { - detail::for_all_major_for_all_nbr_low_degree + } + if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { + raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2], + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_low_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[2], + matrix_partition.get_major_first() + (*segment_offsets)[3], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + output_buffer_first + (update_major ? 
(*segment_offsets)[2] : vertex_t{0}), + e_op, + major_init); + } + if (matrix_partition.get_dcs_nzd_vertex_count()) { + if constexpr (update_major) { // this is necessary as we don't visit every vertex in the + // hypersparse segment in + // for_all_major_for_all_nbr_hypersparse + thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + output_buffer_first + (*segment_offsets)[3], + output_buffer_first + (*segment_offsets)[4], + major_init); + } + if (*(matrix_partition.get_dcs_nzd_vertex_count()) > 0) { + raft::grid_1d_thread_t update_grid(*(matrix_partition.get_dcs_nzd_vertex_count()), + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_hypersparse + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[3], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + output_buffer_first + (update_major ? (*segment_offsets)[3] : vertex_t{0}), + e_op, + major_init); + } + } + } else { + if (matrix_partition.get_major_size() > 0) { + raft::grid_1d_thread_t update_grid(matrix_partition.get_major_size(), + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_low_degree <<>>( matrix_partition, - graph_view.get_vertex_partition_first(comm_root_rank), - graph_view.get_vertex_partition_last(comm_root_rank), - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - vertex_value_output_first, + matrix_partition.get_major_first(), + matrix_partition.get_major_last(), + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + output_buffer_first, e_op, major_init); } } - if (GraphViewType::is_multi_gpu && (in == GraphViewType::is_adj_matrix_transposed)) { + if (GraphViewType::is_multi_gpu && update_major) { + auto& comm = handle.get_comms(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); auto const row_comm_rank = row_comm.get_rank(); auto const row_comm_size = row_comm.get_size(); @@ -503,35 +594,39 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - if (graph_view.is_hypergraph_partitioned()) { - device_reduce( - col_comm, - major_buffer_first, - vertex_value_output_first, - static_cast(graph_view.get_vertex_partition_size(i * row_comm_size + i)), - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } else { - device_reduce(row_comm, - major_buffer_first, - vertex_value_output_first, - static_cast( - graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i)), - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } - } + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (beginning of col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + + device_reduce(col_comm, + major_buffer_first, + vertex_value_output_first, + matrix_partition.get_major_size(), + raft::comms::op_t::SUM, + i, + handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize( - 
handle.get_stream())); // this is as necessary major_tmp_buffer will become out-of-scope once - // control flow exits this block (FIXME: we can reduce stream - // synchronization if we compute the maximum major_tmp_buffer_size and - // allocate major_tmp_buffer outside the loop) + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (end of col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + } } - if (GraphViewType::is_multi_gpu && (in != GraphViewType::is_adj_matrix_transposed)) { + if (GraphViewType::is_multi_gpu && !update_major) { auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -541,59 +636,41 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - for (int i = 0; i < col_comm_size; ++i) { - auto offset = (graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + i) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)); - auto size = static_cast( - graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + i)); - device_reduce(col_comm, - minor_buffer_first + offset, - minor_buffer_first + offset, - size, - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (beginning of row_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif - // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph - // partitioning - auto comm_src_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - auto comm_dst_rank = row_comm_rank * col_comm_size + col_comm_rank; - // FIXME: this branch may no longer necessary with NCCL backend - if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - auto offset = - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + col_comm_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size); - auto size = static_cast( - graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + col_comm_rank)); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - minor_buffer_first + offset, - minor_buffer_first + offset + size, - vertex_value_output_first); - } else { - device_sendrecv( - comm, - minor_buffer_first + - (graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + col_comm_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)), - static_cast( - graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + col_comm_rank)), - comm_dst_rank, - vertex_value_output_first, - static_cast(graph_view.get_vertex_partition_size(comm_rank)), - 
comm_src_rank, - handle.get_stream()); - } + for (int i = 0; i < row_comm_size; ++i) { + auto offset = (graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size + i) - + graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size)); + device_reduce(row_comm, + minor_buffer_first + offset, + vertex_value_output_first, + static_cast( + graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i)), + raft::comms::op_t::SUM, + i, + handle.get_stream()); } - } - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // this is as necessary minor_tmp_buffer will become out-of-scope once - // control flow exits this block + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (end of row_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + } } } // namespace detail @@ -627,7 +704,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, * get_number_of_local_adj_matrix_partition_cols())) and returns a value to be reduced. - * @param init Initial value to be added to the reduced @e_op return values for each vertex. + * @param init Initial value to be added to the reduced @p e_op return values for each vertex. * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first * (inclusive) vertex (assigned to tihs process in multi-GPU). `vertex_value_output_last` * (exclusive) is deduced as @p vertex_value_output_first + @p @@ -689,7 +766,7 @@ void copy_v_transform_reduce_in_nbr(raft::handle_t const& handle, * adj_matrix_col_value_input_first + j) (where i is in [0, * graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, * get_number_of_local_adj_matrix_partition_cols())) and returns a value to be reduced. - * @param init Initial value to be added to the reduced @e_op return values for each vertex. + * @param init Initial value to be added to the reduced @p e_op return values for each vertex. * @param vertex_value_output_first Iterator pointing to the vertex property variables for the * first (inclusive) vertex (assigned to tihs process in multi-GPU). `vertex_value_output_last` * (exclusive) is deduced as @p vertex_value_output_first + @p diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh new file mode 100644 index 00000000000..c504fa49526 --- /dev/null +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -0,0 +1,695 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +// FIXME: block size requires tuning +int32_t constexpr copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size = 1024; + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct minor_to_key_t { + using vertex_t = typename std::iterator_traits::value_type; + VertexIterator adj_matrix_col_key_first{}; + vertex_t minor_first{}; + __device__ vertex_t operator()(vertex_t minor) + { + return *(adj_matrix_col_key_first + (minor - minor_first)); + } +}; + +template +__global__ void for_all_major_for_all_nbr_mid_degree( + matrix_partition_device_view_t matrix_partition, + vertex_t major_first, + vertex_t major_last, + vertex_t* majors) +{ + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + static_assert( + copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size % raft::warp_size() == 0); + auto const lane_id = tid % raft::warp_size(); + auto major_start_offset = static_cast(major_first - matrix_partition.get_major_first()); + size_t idx = static_cast(tid / raft::warp_size()); + + while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; + auto major = + matrix_partition.get_major_from_major_offset_nocheck(static_cast(major_offset)); + vertex_t const* indices{nullptr}; + thrust::optional weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(major_offset); + auto local_offset = matrix_partition.get_local_offset(major_offset); + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + majors[local_offset + i] = major; + } + idx += gridDim.x * (blockDim.x / raft::warp_size()); + } +} + +template +__global__ void for_all_major_for_all_nbr_high_degree( + matrix_partition_device_view_t matrix_partition, + vertex_t major_first, + vertex_t major_last, + vertex_t* majors) +{ + auto major_start_offset = static_cast(major_first - matrix_partition.get_major_first()); + size_t idx = static_cast(blockIdx.x); + + while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; + auto major = + matrix_partition.get_major_from_major_offset_nocheck(static_cast(major_offset)); + vertex_t const* indices{nullptr}; + thrust::optional weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = + matrix_partition.get_local_edges(static_cast(major_offset)); + auto local_offset = matrix_partition.get_local_offset(major_offset); + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + majors[local_offset + i] = major; + } + idx += gridDim.x; + } +} + +template +void decompress_matrix_partition_to_fill_edgelist_majors( + raft::handle_t const& handle, + matrix_partition_device_view_t matrix_partition, + vertex_t* majors, + std::optional> const& segment_offsets) +{ + if (segment_offsets) { + // FIXME: we may further improve performance by 1) concurrently running kernels on different + // segments; 2) individually tuning block sizes for different segments; and 3) adding one more + // segment for very high degree vertices and running segmented reduction + 
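+    // Illustrative note (inferred from the dispatch below, not authoritative): segment_offsets is
+    // assumed to partition this matrix partition's majors by degree, so that each range gets a
+    // launch configuration matched to its available parallelism:
+    //   [offsets[0], offsets[1]) high-degree majors -> one thread block per major
+    //   [offsets[1], offsets[2]) mid-degree majors  -> one warp per major
+    //   [offsets[2], offsets[3]) low-degree majors  -> one thread per major
+    //   past offsets[3]          hypersparse (dcs) majors, handled separately at the end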
+    static_assert(detail::num_sparse_segments_per_vertex_partition == 3);
+    if ((*segment_offsets)[1] > 0) {
+      raft::grid_1d_block_t update_grid(
+        (*segment_offsets)[1],
+        detail::copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size,
+        handle.get_device_properties().maxGridSize[0]);
+
+      detail::for_all_major_for_all_nbr_high_degree<<>>(
+        matrix_partition,
+        matrix_partition.get_major_first(),
+        matrix_partition.get_major_first() + (*segment_offsets)[1],
+        majors);
+    }
+    if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) {
+      raft::grid_1d_warp_t update_grid(
+        (*segment_offsets)[2] - (*segment_offsets)[1],
+        detail::copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size,
+        handle.get_device_properties().maxGridSize[0]);
+
+      detail::for_all_major_for_all_nbr_mid_degree<<>>(
+        matrix_partition,
+        matrix_partition.get_major_first() + (*segment_offsets)[1],
+        matrix_partition.get_major_first() + (*segment_offsets)[2],
+        majors);
+    }
+    if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) {
+      thrust::for_each(
+        rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+        thrust::make_counting_iterator(matrix_partition.get_major_first()) + (*segment_offsets)[2],
+        thrust::make_counting_iterator(matrix_partition.get_major_first()) + (*segment_offsets)[3],
+        [matrix_partition, majors] __device__(auto major) {
+          auto major_offset = matrix_partition.get_major_offset_from_major_nocheck(major);
+          auto local_degree = matrix_partition.get_local_degree(major_offset);
+          auto local_offset = matrix_partition.get_local_offset(major_offset);
+          thrust::fill(
+            thrust::seq, majors + local_offset, majors + local_offset + local_degree, major);
+        });
+    }
+    if (matrix_partition.get_dcs_nzd_vertex_count() &&
+        (*(matrix_partition.get_dcs_nzd_vertex_count()) > 0)) {
+      thrust::for_each(
+        rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+        thrust::make_counting_iterator(vertex_t{0}),
+        thrust::make_counting_iterator(*(matrix_partition.get_dcs_nzd_vertex_count())),
+        [matrix_partition, major_start_offset = (*segment_offsets)[3], majors] __device__(
+          auto idx) {
+          auto major = *(matrix_partition.get_major_from_major_hypersparse_idx_nocheck(idx));
+          auto major_idx =
+            major_start_offset + idx;  // major_offset != major_idx in the hypersparse region
+          auto local_degree = matrix_partition.get_local_degree(major_idx);
+          auto local_offset = matrix_partition.get_local_offset(major_idx);
+          thrust::fill(
+            thrust::seq, majors + local_offset, majors + local_offset + local_degree, major);
+        });
+    }
+  } else {
+    thrust::for_each(
+      rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+      thrust::make_counting_iterator(matrix_partition.get_major_first()),
+      thrust::make_counting_iterator(matrix_partition.get_major_first()) +
+        matrix_partition.get_major_size(),
+      [matrix_partition, majors] __device__(auto major) {
+        auto major_offset = matrix_partition.get_major_offset_from_major_nocheck(major);
+        auto local_degree = matrix_partition.get_local_degree(major_offset);
+        auto local_offset = matrix_partition.get_local_offset(major_offset);
+        thrust::fill(
+          thrust::seq, majors + local_offset, majors + local_offset + local_degree, major);
+      });
+  }
+}
+
+}  // namespace detail
+
+/**
+ * @brief Iterate over every vertex's key-aggregated outgoing edges to update vertex properties.
+ *
+ * This function is inspired by thrust::transform_reduce() (iteration over the outgoing edges
+ * part) and thrust::copy() (update vertex properties part, taking transform_reduce output as copy
+ * input).
+ * Unlike copy_v_transform_reduce_out_nbr, this function first aggregates outgoing edges by key to
+ * support two-level reduction for every vertex: edges sharing the same destination key are first
+ * combined into a single key-aggregated edge per (vertex, key) pair, and the per-key results are
+ * then reduced into each vertex's output value.
+ *
+ * @tparam GraphViewType Type of the passed non-owning graph object.
+ * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row
+ * input properties.
+ * @tparam VertexIterator Type of the iterator for graph adjacency matrix column key values for
+ * aggregation (key type should coincide with vertex type).
+ * @tparam ValueIterator Type of the iterator for values in (key, value) pairs.
+ * @tparam KeyAggregatedEdgeOp Type of the quinary key-aggregated edge operator.
+ * @tparam ReduceOp Type of the binary reduction operator.
+ * @tparam T Type of the initial value for reduction over the key-aggregated outgoing edges.
+ * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Non-owning graph object.
+ * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input
+ * properties for the first (inclusive) row (assigned to this process in multi-GPU).
+ * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first
+ * + @p graph_view.get_number_of_local_adj_matrix_partition_rows().
+ * @param adj_matrix_col_key_first Iterator pointing to the adjacency matrix column key (for
+ * aggregation) for the first (inclusive) column (assigned to this process in multi-GPU).
+ * `adj_matrix_col_key_last` (exclusive) is deduced as @p adj_matrix_col_key_first + @p
+ * graph_view.get_number_of_local_adj_matrix_partition_cols().
+ * @param map_key_first Iterator pointing to the first (inclusive) key in (key, value) pairs
+ * (assigned to this process in multi-GPU,
+ * `cugraph::experimental::detail::compute_gpu_id_from_vertex_t` is used to map keys to processes).
+ * (Key, value) pairs may be provided by transform_reduce_by_adj_matrix_row_key_e() or
+ * transform_reduce_by_adj_matrix_col_key_e().
+ * @param map_key_last Iterator pointing to the last (exclusive) key in (key, value) pairs (assigned
+ * to this process in multi-GPU).
+ * @param map_value_first Iterator pointing to the first (inclusive) value in (key, value) pairs
+ * (assigned to this process in multi-GPU). `map_value_last` (exclusive) is deduced as @p
+ * map_value_first + thrust::distance(@p map_key_first, @p map_key_last).
+ * @param key_aggregated_e_op Quinary operator that takes edge source, key, aggregated edge weight,
+ * *(@p adj_matrix_row_value_input_first + i), and the value for the key stored in the input (key,
+ * value) pairs provided by @p map_key_first, @p map_key_last, and @p map_value_first (aggregated
+ * over the entire set of processes in multi-GPU).
+ * @param reduce_op Binary operator that takes two input arguments and reduces them to one.
+ * @param init Initial value to be added to the reduced @p reduce_op return values for each vertex.
+ * @param vertex_value_output_first Iterator pointing to the vertex property variables for the
+ * first (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last`
+ * (exclusive) is deduced as @p vertex_value_output_first + @p
+ * graph_view.get_number_of_local_vertices().
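+ *
+ * A minimal usage sketch follows; every name other than the primitive itself (the handles, views,
+ * buffers, and the lambda) is hypothetical and only illustrates the expected argument shapes:
+ * @code
+ * copy_v_transform_reduce_key_aggregated_out_nbr(
+ *   handle,
+ *   graph_view,
+ *   vertex_values.begin(),        // adj_matrix_row_value_input_first
+ *   adj_matrix_col_keys.begin(),  // adj_matrix_col_key_first
+ *   map_keys.begin(),
+ *   map_keys.end(),
+ *   map_values.begin(),
+ *   [] __device__(auto src, auto key, auto aggregated_w, auto src_val, auto key_val) {
+ *     return aggregated_w * key_val;  // quinary key-aggregated edge operator
+ *   },
+ *   thrust::plus<weight_t>{},  // reduce_op
+ *   weight_t{0.0},             // init
+ *   vertex_value_output.begin());
+ * @endcode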
+ */ +template +void copy_v_transform_reduce_key_aggregated_out_nbr( + raft::handle_t const& handle, + GraphViewType const& graph_view, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + VertexIterator0 adj_matrix_col_key_first, + VertexIterator1 map_key_first, + VertexIterator1 map_key_last, + ValueIterator map_value_first, + KeyAggregatedEdgeOp key_aggregated_e_op, + ReduceOp reduce_op, + T init, + VertexValueOutputIterator vertex_value_output_first) +{ + static_assert(!GraphViewType::is_adj_matrix_transposed, + "GraphViewType should support the push model."); + static_assert(std::is_same::value_type, + typename GraphViewType::vertex_type>::value); + static_assert(std::is_same::value_type, + typename std::iterator_traits::value_type>::value); + static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using value_t = typename std::iterator_traits::value_type; + + double constexpr load_factor = 0.7; + + // 1. build a cuco::static_map object for the k, v pairs. + + auto poly_alloc = rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()); + auto stream_adapter = rmm::mr::make_stream_allocator_adaptor(poly_alloc, cudaStream_t{nullptr}); + auto kv_map_ptr = std::make_unique< + cuco::static_map>( + size_t{0}, + invalid_vertex_id::value, + invalid_vertex_id::value, + stream_adapter); + if (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (beginning of row_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + + auto map_counts = + host_scalar_allgather(row_comm, + static_cast(thrust::distance(map_key_first, map_key_last)), + handle.get_stream()); + std::vector map_displacements(row_comm_size, size_t{0}); + std::partial_sum(map_counts.begin(), map_counts.end() - 1, map_displacements.begin() + 1); + rmm::device_uvector map_keys(map_displacements.back() + map_counts.back(), + handle.get_stream()); + auto map_value_buffer = + allocate_dataframe_buffer(map_keys.size(), handle.get_stream()); + for (int i = 0; i < row_comm_size; ++i) { + device_bcast(row_comm, + map_key_first, + map_keys.begin() + map_displacements[i], + map_counts[i], + i, + handle.get_stream()); + device_bcast(row_comm, + map_value_first, + get_dataframe_buffer_begin(map_value_buffer) + map_displacements[i], + map_counts[i], + i, + handle.get_stream()); + } + // FIXME: these copies are unnecessary, better fix RAFT comm's bcast to take separate input & + // output pointers + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + map_key_first, + map_key_last, + map_keys.begin() + map_displacements[row_comm_rank]); + thrust::copy( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + map_value_first, + map_value_first + thrust::distance(map_key_first, map_key_last), + get_dataframe_buffer_begin(map_value_buffer) + 
map_displacements[row_comm_rank]); + + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + kv_map_ptr.reset(); + + kv_map_ptr = std::make_unique< + cuco::static_map>( + // cuco::static_map requires at least one empty slot + std::max(static_cast(static_cast(map_keys.size()) / load_factor), + static_cast(thrust::distance(map_key_first, map_key_last)) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value, + stream_adapter); + + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(map_keys.begin(), get_dataframe_buffer_begin(map_value_buffer))); + kv_map_ptr->insert(pair_first, pair_first + map_keys.size()); + } else { + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + kv_map_ptr.reset(); + + kv_map_ptr = std::make_unique< + cuco::static_map>( + // cuco::static_map requires at least one empty slot + std::max(static_cast( + static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), + static_cast(thrust::distance(map_key_first, map_key_last)) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value, + stream_adapter); + + auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)); + kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + } + + // 2. aggregate each vertex out-going edges based on keys and transform-reduce. + + if (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (beginning of col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + } + + rmm::device_uvector major_vertices(0, handle.get_stream()); + auto e_op_result_buffer = allocate_dataframe_buffer(0, handle.get_stream()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + auto matrix_partition = + matrix_partition_device_view_t( + graph_view.get_matrix_partition_view(i)); + + rmm::device_uvector tmp_major_vertices(matrix_partition.get_number_of_edges(), + handle.get_stream()); + rmm::device_uvector tmp_minor_keys(tmp_major_vertices.size(), handle.get_stream()); + rmm::device_uvector tmp_key_aggregated_edge_weights( + graph_view.is_weighted() ? 
tmp_major_vertices.size() : size_t{0}, handle.get_stream()); + + if (matrix_partition.get_major_size() > 0) { + auto minor_key_first = thrust::make_transform_iterator( + matrix_partition.get_indices(), + detail::minor_to_key_t{adj_matrix_col_key_first, + matrix_partition.get_minor_first()}); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + minor_key_first, + minor_key_first + matrix_partition.get_number_of_edges(), + tmp_minor_keys.begin()); + if (graph_view.is_weighted()) { + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + *(matrix_partition.get_weights()), + *(matrix_partition.get_weights()) + matrix_partition.get_number_of_edges(), + tmp_key_aggregated_edge_weights.begin()); + } + detail::decompress_matrix_partition_to_fill_edgelist_majors( + handle, + matrix_partition, + tmp_major_vertices.data(), + graph_view.get_local_adj_matrix_partition_segment_offsets(i)); + rmm::device_uvector reduced_major_vertices(tmp_major_vertices.size(), + handle.get_stream()); + rmm::device_uvector reduced_minor_keys(reduced_major_vertices.size(), + handle.get_stream()); + rmm::device_uvector reduced_key_aggregated_edge_weights( + reduced_major_vertices.size(), handle.get_stream()); + size_t reduced_size{}; + // FIXME: cub segmented sort may be more efficient as this is already sorted by major + auto input_key_first = thrust::make_zip_iterator( + thrust::make_tuple(tmp_major_vertices.begin(), tmp_minor_keys.begin())); + auto output_key_first = thrust::make_zip_iterator( + thrust::make_tuple(reduced_major_vertices.begin(), reduced_minor_keys.begin())); + if (graph_view.is_weighted()) { + thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + input_key_first, + input_key_first + tmp_major_vertices.size(), + tmp_key_aggregated_edge_weights.begin()); + reduced_size = + thrust::distance(output_key_first, + thrust::get<0>(thrust::reduce_by_key( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + input_key_first, + input_key_first + tmp_major_vertices.size(), + tmp_key_aggregated_edge_weights.begin(), + output_key_first, + reduced_key_aggregated_edge_weights.begin()))); + } else { + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + input_key_first, + input_key_first + tmp_major_vertices.size()); + reduced_size = + thrust::distance(output_key_first, + thrust::get<0>(thrust::reduce_by_key( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + input_key_first, + input_key_first + tmp_major_vertices.size(), + thrust::make_constant_iterator(weight_t{1.0}), + output_key_first, + reduced_key_aggregated_edge_weights.begin()))); + } + tmp_major_vertices = std::move(reduced_major_vertices); + tmp_minor_keys = std::move(reduced_minor_keys); + tmp_key_aggregated_edge_weights = std::move(reduced_key_aggregated_edge_weights); + tmp_major_vertices.resize(reduced_size, handle.get_stream()); + tmp_minor_keys.resize(tmp_major_vertices.size(), handle.get_stream()); + tmp_key_aggregated_edge_weights.resize(tmp_major_vertices.size(), handle.get_stream()); + tmp_major_vertices.shrink_to_fit(handle.get_stream()); + tmp_minor_keys.shrink_to_fit(handle.get_stream()); + tmp_key_aggregated_edge_weights.shrink_to_fit(handle.get_stream()); + } + + if (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = 
row_comm.get_size(); + + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + auto triplet_first = + thrust::make_zip_iterator(thrust::make_tuple(tmp_major_vertices.begin(), + tmp_minor_keys.begin(), + tmp_key_aggregated_edge_weights.begin())); + rmm::device_uvector rx_major_vertices(0, handle.get_stream()); + rmm::device_uvector rx_minor_keys(0, handle.get_stream()); + rmm::device_uvector rx_key_aggregated_edge_weights(0, handle.get_stream()); + std::forward_as_tuple( + std::tie(rx_major_vertices, rx_minor_keys, rx_key_aggregated_edge_weights), std::ignore) = + groupby_gpuid_and_shuffle_values( + col_comm, + triplet_first, + triplet_first + tmp_major_vertices.size(), + [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}, + row_comm_size] __device__(auto val) { + return key_func(thrust::get<1>(val)) / row_comm_size; + }, + handle.get_stream()); + + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(rx_major_vertices.begin(), rx_minor_keys.begin())); + thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + pair_first, + pair_first + rx_major_vertices.size(), + rx_key_aggregated_edge_weights.begin()); + tmp_major_vertices.resize(rx_major_vertices.size(), handle.get_stream()); + tmp_minor_keys.resize(tmp_major_vertices.size(), handle.get_stream()); + tmp_key_aggregated_edge_weights.resize(tmp_major_vertices.size(), handle.get_stream()); + auto pair_it = + thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + pair_first, + pair_first + rx_major_vertices.size(), + rx_key_aggregated_edge_weights.begin(), + thrust::make_zip_iterator(thrust::make_tuple( + tmp_major_vertices.begin(), tmp_minor_keys.begin())), + tmp_key_aggregated_edge_weights.begin()); + tmp_major_vertices.resize( + thrust::distance(tmp_key_aggregated_edge_weights.begin(), thrust::get<1>(pair_it)), + handle.get_stream()); + tmp_minor_keys.resize(tmp_major_vertices.size(), handle.get_stream()); + tmp_key_aggregated_edge_weights.resize(tmp_major_vertices.size(), handle.get_stream()); + tmp_major_vertices.shrink_to_fit(handle.get_stream()); + tmp_minor_keys.shrink_to_fit(handle.get_stream()); + tmp_key_aggregated_edge_weights.shrink_to_fit(handle.get_stream()); + } + + auto tmp_e_op_result_buffer = + allocate_dataframe_buffer(tmp_major_vertices.size(), handle.get_stream()); + auto tmp_e_op_result_buffer_first = get_dataframe_buffer_begin(tmp_e_op_result_buffer); + + auto triplet_first = thrust::make_zip_iterator(thrust::make_tuple( + tmp_major_vertices.begin(), tmp_minor_keys.begin(), tmp_key_aggregated_edge_weights.begin())); + thrust::transform( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + triplet_first, + triplet_first + tmp_major_vertices.size(), + tmp_e_op_result_buffer_first, + [adj_matrix_row_value_input_first = + adj_matrix_row_value_input_first + matrix_partition.get_major_value_start_offset(), + key_aggregated_e_op, + matrix_partition, + kv_map = kv_map_ptr->get_device_view()] __device__(auto val) { + auto major = thrust::get<0>(val); + auto key = thrust::get<1>(val); + auto w = thrust::get<2>(val); + return key_aggregated_e_op(major, + key, + w, + *(adj_matrix_row_value_input_first + + matrix_partition.get_major_offset_from_major_nocheck(major)), + kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); + }); + tmp_minor_keys.resize(0, handle.get_stream()); + tmp_key_aggregated_edge_weights.resize(0, 
handle.get_stream()); + tmp_minor_keys.shrink_to_fit(handle.get_stream()); + tmp_key_aggregated_edge_weights.shrink_to_fit(handle.get_stream()); + + if (GraphViewType::is_multi_gpu) { + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + // FIXME: additional optimization is possible if reduce_op is a pure function (and reduce_op + // can be mapped to ncclRedOp_t). + + auto rx_sizes = + host_scalar_gather(col_comm, tmp_major_vertices.size(), i, handle.get_stream()); + std::vector rx_displs{}; + rmm::device_uvector rx_major_vertices(0, handle.get_stream()); + if (static_cast(col_comm_rank) == i) { + rx_displs.assign(col_comm_size, size_t{0}); + std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); + rx_major_vertices.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream()); + } + auto rx_tmp_e_op_result_buffer = + allocate_dataframe_buffer(rx_major_vertices.size(), handle.get_stream()); + + device_gatherv(col_comm, + tmp_major_vertices.data(), + rx_major_vertices.data(), + tmp_major_vertices.size(), + rx_sizes, + rx_displs, + i, + handle.get_stream()); + device_gatherv(col_comm, + tmp_e_op_result_buffer_first, + get_dataframe_buffer_begin(rx_tmp_e_op_result_buffer), + tmp_major_vertices.size(), + rx_sizes, + rx_displs, + i, + handle.get_stream()); + + if (static_cast(col_comm_rank) == i) { + major_vertices = std::move(rx_major_vertices); + e_op_result_buffer = std::move(rx_tmp_e_op_result_buffer); + } + } else { + major_vertices = std::move(tmp_major_vertices); + e_op_result_buffer = std::move(tmp_e_op_result_buffer); + } + } + + if (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (beginning of col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + } + + thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_value_output_first, + vertex_value_output_first + graph_view.get_number_of_local_vertices(), + T{}); + thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + major_vertices.begin(), + major_vertices.end(), + get_dataframe_buffer_begin(e_op_result_buffer)); + + auto num_uniques = thrust::count_if( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(major_vertices.size()), + [major_vertices = major_vertices.data()] __device__(auto i) { + return ((i == 0) || (major_vertices[i] != major_vertices[i - 1])) ? true : false; + }); + rmm::device_uvector unique_major_vertices(num_uniques, handle.get_stream()); + + auto major_vertex_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + [major_vertices = major_vertices.data()] __device__(auto i) { + return ((i == 0) || (major_vertices[i] != major_vertices[i - 1])) + ? 
major_vertices[i] + : invalid_vertex_id::value; + }); + thrust::copy_if( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + major_vertex_first, + major_vertex_first + major_vertices.size(), + unique_major_vertices.begin(), + [] __device__(auto major) { return major != invalid_vertex_id::value; }); + thrust::reduce_by_key( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + major_vertices.begin(), + major_vertices.end(), + get_dataframe_buffer_begin(e_op_result_buffer), + thrust::make_discard_iterator(), + thrust::make_permutation_iterator( + vertex_value_output_first, + thrust::make_transform_iterator( + unique_major_vertices.begin(), + [vertex_partition = vertex_partition_device_view_t( + graph_view.get_vertex_partition_view())] __device__(auto v) { + return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); + })), + thrust::equal_to{}, + reduce_op); + + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_value_output_first, + vertex_value_output_first + graph_view.get_number_of_local_vertices(), + vertex_value_output_first, + [reduce_op, init] __device__(auto val) { return reduce_op(val, init); }); +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/cugraph/prims/count_if_e.cuh b/cpp/include/cugraph/prims/count_if_e.cuh new file mode 100644 index 00000000000..cfbb81d9bc9 --- /dev/null +++ b/cpp/include/cugraph/prims/count_if_e.cuh @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cugraph { +namespace experimental { + +/** + * @brief Count the number of edges that satisfies the given predicate. + * + * This function is inspired by thrust::count_if(). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row + * input properties. + * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column + * input properties. + * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input + * properties for the first (inclusive) row (assigned to this process in multi-GPU). + * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + + * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). + * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input + * properties for the first (inclusive) column (assigned to this process in multi-GPU). 
+ * `adj_matrix_col_value_input_last` (exclusive) is deduced as @p adj_matrix_col_value_input_first
+ * + @p graph_view.get_number_of_local_adj_matrix_partition_cols().
+ * @param e_op Quaternary (or quinary) operator that takes edge source, edge destination, (optional
+ * edge weight), *(@p adj_matrix_row_value_input_first + i), and *(@p
+ * adj_matrix_col_value_input_first + j) (where i is in [0,
+ * graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0,
+ * get_number_of_local_adj_matrix_partition_cols())) and returns true if this edge should be
+ * included in the returned count.
+ * @return GraphViewType::edge_type Number of times @p e_op returned true.
+ */
+template 
+typename GraphViewType::edge_type count_if_e(
+  raft::handle_t const& handle,
+  GraphViewType const& graph_view,
+  AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first,
+  AdjMatrixColValueInputIterator adj_matrix_col_value_input_first,
+  EdgeOp e_op)
+{
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+
+  return transform_reduce_e(handle,
+                            graph_view,
+                            adj_matrix_row_value_input_first,
+                            adj_matrix_col_value_input_first,
+                            cast_edge_op_bool_to_integer{e_op},
+                            edge_t{0});
+}
+
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/include/patterns/count_if_v.cuh b/cpp/include/cugraph/prims/count_if_v.cuh
similarity index 96%
rename from cpp/include/patterns/count_if_v.cuh
rename to cpp/include/cugraph/prims/count_if_v.cuh
index c90b259cdde..ef49a3e463b 100644
--- a/cpp/include/patterns/count_if_v.cuh
+++ b/cpp/include/cugraph/prims/count_if_v.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,9 +15,9 @@
  */
 #pragma once
 
-#include 
-#include 
-#include 
+#include 
+#include 
+#include 
 
 #include 
 #include 
diff --git a/cpp/include/patterns/edge_op_utils.cuh b/cpp/include/cugraph/prims/property_op_utils.cuh
similarity index 58%
rename from cpp/include/patterns/edge_op_utils.cuh
rename to cpp/include/cugraph/prims/property_op_utils.cuh
index 58fb31c7605..ec3ed788cc1 100644
--- a/cpp/include/patterns/edge_op_utils.cuh
+++ b/cpp/include/cugraph/prims/property_op_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include @@ -30,19 +30,20 @@ namespace cugraph { namespace experimental { -template +template struct is_valid_edge_op { static constexpr bool value = false; }; -template +template struct is_valid_edge_op< - ResultOfEdgeOp, - typename std::conditional::type> { + InvokeResultEdgeOp, + typename std::conditional_t> { static constexpr bool valid = true; }; template @@ -52,44 +53,98 @@ struct evaluate_edge_op { using row_value_type = typename std::iterator_traits::value_type; using col_value_type = typename std::iterator_traits::value_type; - template - __device__ std::enable_if_t>::valid, - typename std::result_of::type> - compute(V r, V c, W w, R rv, C cv, E e) + __device__ + std::enable_if_t>::valid, + typename std::invoke_result::type> + compute(K r, V c, W w, R rv, C cv, E e) { return e(r, c, w, rv, cv); } - template - __device__ std::enable_if_t>::valid, - typename std::result_of::type> - compute(V r, V c, W w, R rv, C cv, E e) + __device__ std::enable_if_t>::valid, + typename std::invoke_result::type> + compute(K r, V c, W w, R rv, C cv, E e) { return e(r, c, rv, cv); } }; -template -__host__ __device__ std::enable_if_t::value, T> plus_edge_op_result( - T const& lhs, T const& rhs) -{ - return lhs + rhs; -} +template +struct cast_edge_op_bool_to_integer { + static_assert(std::is_integral::value); + using vertex_type = typename GraphViewType::vertex_type; + using weight_type = typename GraphViewType::weight_type; + using row_value_type = typename std::iterator_traits::value_type; + using col_value_type = typename std::iterator_traits::value_type; + + EdgeOp e_op{}; + + template + __device__ + std::enable_if_t>::valid, T> + operator()(K r, V c, W w, R rv, C cv) + { + return e_op(r, c, w, rv, cv) ? T{1} : T{0}; + } + + template + __device__ + std::enable_if_t>::valid, T> + operator()(K r, V c, R rv, C cv) + { + return e_op(r, c, rv, cv) ? T{1} : T{0}; + } +}; template -__host__ __device__ std::enable_if_t::value, T> plus_edge_op_result(T const& lhs, - T const& rhs) -{ - return plus_thrust_tuple()(lhs, rhs); -} +struct property_add : public thrust::plus { +}; + +template +struct property_add> + : public thrust:: + binary_function, thrust::tuple, thrust::tuple> { + using Type = thrust::tuple; + + private: + template + __device__ constexpr auto sum_impl(T& t1, T& t2, std::index_sequence) + { + return thrust::make_tuple((thrust::get(t1) + thrust::get(t2))...); + } + + public: + __device__ constexpr auto operator()(const Type& t1, const Type& t2) + { + return sum_impl(t1, t2, std::make_index_sequence::value>()); + } +}; template __device__ std::enable_if_t::value, void> diff --git a/cpp/include/patterns/reduce_op.cuh b/cpp/include/cugraph/prims/reduce_op.cuh similarity index 61% rename from cpp/include/patterns/reduce_op.cuh rename to cpp/include/cugraph/prims/reduce_op.cuh index e9011914292..e73a2861cb0 100644 --- a/cpp/include/patterns/reduce_op.cuh +++ b/cpp/include/cugraph/prims/reduce_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,20 +20,34 @@ namespace cugraph { namespace experimental { namespace reduce_op { +// in case there is no payload to reduce +struct null { + using type = void; +}; + // reducing N elements, any element can be a valid output. 
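+// (note: as implemented, operator() below simply returns its first argument, so whichever element
+// the reduction encounters first is kept; this is only correct when every element is acceptable)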
template struct any { - using type = T; + using type = T; + // FIXME: actually every reduction operation should be side-effect free if reduction is performed + // by thrust; thrust reduction call rounds up the number of invocations based on the block size + // and discards the values outside the valid range; this does not work if the reduction operation + // has side-effects. static constexpr bool pure_function = true; // this can be called in any process __host__ __device__ T operator()(T const& lhs, T const& rhs) const { return lhs; } }; +// FIXME: thrust::minimum can replace this. // reducing N elements (operator < should be defined between any two elements), the minimum element // should be selected. template struct min { - using type = T; + using type = T; + // FIXME: actually every reduction operation should be side-effect free if reduction is performed + // by thrust; thrust reduction call rounds up the number of invocations based on the block size + // and discards the values outside the valid range; this does not work if the reduction operation + // has side-effects. static constexpr bool pure_function = true; // this can be called in any process __host__ __device__ T operator()(T const& lhs, T const& rhs) const diff --git a/cpp/include/patterns/reduce_v.cuh b/cpp/include/cugraph/prims/reduce_v.cuh similarity index 83% rename from cpp/include/patterns/reduce_v.cuh rename to cpp/include/cugraph/prims/reduce_v.cuh index 12224dc55f4..c7c504942d1 100644 --- a/cpp/include/patterns/reduce_v.cuh +++ b/cpp/include/cugraph/prims/reduce_v.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,10 @@ */ #pragma once -#include -#include -#include +#include +#include +#include +#include #include @@ -51,10 +52,12 @@ T reduce_v(raft::handle_t const& handle, VertexValueInputIterator vertex_value_input_first, T init) { - auto ret = thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_value_input_first, - vertex_value_input_first + graph_view.get_number_of_local_vertices(), - init); + auto ret = thrust::reduce( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_value_input_first, + vertex_value_input_first + graph_view.get_number_of_local_vertices(), + ((GraphViewType::is_multi_gpu) && (handle.get_comms().get_rank() == 0)) ? init : T{}, + property_add()); if (GraphViewType::is_multi_gpu) { ret = host_scalar_allreduce(handle.get_comms(), ret, handle.get_stream()); } @@ -87,7 +90,11 @@ T reduce_v(raft::handle_t const& handle, T init) { auto ret = thrust::reduce( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), input_first, input_last, init); + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + input_first, + input_last, + ((GraphViewType::is_multi_gpu) && (handle.get_comms().get_rank() == 0)) ? 
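+      /* [editor's note] `init` is folded in on rank 0 only; the host_scalar_allreduce below would
+         otherwise add it once per GPU */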
init : T{}, + property_add()); if (GraphViewType::is_multi_gpu) { ret = host_scalar_allreduce(handle.get_comms(), ret, handle.get_stream()); } diff --git a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh new file mode 100644 index 00000000000..c1887433fd1 --- /dev/null +++ b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh @@ -0,0 +1,699 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +// FIXME: block size requires tuning +int32_t constexpr transform_reduce_by_adj_matrix_row_col_key_e_for_all_block_size = 128; + +template +__device__ void update_buffer_element( + matrix_partition_device_view_t& matrix_partition, + typename GraphViewType::vertex_type major, + typename GraphViewType::vertex_type minor, + typename GraphViewType::weight_type weight, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + VertexIterator adj_matrix_row_col_key_first, + EdgeOp e_op, + typename GraphViewType::vertex_type* key, + T* value) +{ + using vertex_t = typename GraphViewType::vertex_type; + + auto major_offset = matrix_partition.get_major_offset_from_major_nocheck(major); + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + auto row = GraphViewType::is_adj_matrix_transposed ? minor : major; + auto col = GraphViewType::is_adj_matrix_transposed ? major : minor; + auto row_offset = GraphViewType::is_adj_matrix_transposed ? minor_offset : major_offset; + auto col_offset = GraphViewType::is_adj_matrix_transposed ? major_offset : minor_offset; + + *key = *(adj_matrix_row_col_key_first + + ((GraphViewType::is_adj_matrix_transposed != adj_matrix_row_key) ? 
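+           /* [editor's note] pick the offset along the key axis: rows are the partition's majors
+              (and columns its minors) unless the adjacency matrix is stored transposed */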
major_offset + : minor_offset)); + *value = evaluate_edge_op() + .compute(row, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); +} + +template +__global__ void for_all_major_for_all_nbr_hypersparse( + matrix_partition_device_view_t matrix_partition, + typename GraphViewType::vertex_type major_hypersparse_first, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + VertexIterator adj_matrix_row_col_key_first, + EdgeOp e_op, + typename GraphViewType::vertex_type* keys, + T* values) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto major_start_offset = + static_cast(major_hypersparse_first - matrix_partition.get_major_first()); + auto idx = static_cast(tid); + + auto dcs_nzd_vertex_count = *(matrix_partition.get_dcs_nzd_vertex_count()); + + while (idx < static_cast(dcs_nzd_vertex_count)) { + auto major = + *(matrix_partition.get_major_from_major_hypersparse_idx_nocheck(static_cast(idx))); + auto major_idx = + major_start_offset + idx; // major_offset != major_idx in the hypersparse region + vertex_t const* indices{nullptr}; + thrust::optional weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = + matrix_partition.get_local_edges(static_cast(major_idx)); + auto local_offset = matrix_partition.get_local_offset(major_idx); + for (edge_t i = 0; i < local_degree; ++i) { + update_buffer_element( + matrix_partition, + major, + indices[i], + weights ? (*weights)[i] : weight_t{1.0}, + adj_matrix_row_value_input_first, + adj_matrix_col_value_input_first, + adj_matrix_row_col_key_first, + e_op, + keys + local_offset + i, + values + local_offset + i); + } + + idx += gridDim.x * blockDim.x; + } +} + +template +__global__ void for_all_major_for_all_nbr_low_degree( + matrix_partition_device_view_t matrix_partition, + typename GraphViewType::vertex_type major_first, + typename GraphViewType::vertex_type major_last, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + VertexIterator adj_matrix_row_col_key_first, + EdgeOp e_op, + typename GraphViewType::vertex_type* keys, + T* values) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto major_start_offset = static_cast(major_first - matrix_partition.get_major_first()); + auto idx = static_cast(tid); + + while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; + auto major = + matrix_partition.get_major_from_major_offset_nocheck(static_cast(major_offset)); + vertex_t const* indices{nullptr}; + thrust::optional weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = + matrix_partition.get_local_edges(static_cast(major_offset)); + auto local_offset = matrix_partition.get_local_offset(major_offset); + for (edge_t i = 0; i < local_degree; ++i) { + update_buffer_element( + matrix_partition, + major, + indices[i], + weights ? 
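+          /* [editor's note] edges of an unweighted graph are treated as having weight 1.0 */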
(*weights)[i] : weight_t{1.0}, + adj_matrix_row_value_input_first, + adj_matrix_col_value_input_first, + adj_matrix_row_col_key_first, + e_op, + keys + local_offset + i, + values + local_offset + i); + } + + idx += gridDim.x * blockDim.x; + } +} + +template +__global__ void for_all_major_for_all_nbr_mid_degree( + matrix_partition_device_view_t matrix_partition, + typename GraphViewType::vertex_type major_first, + typename GraphViewType::vertex_type major_last, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + VertexIterator adj_matrix_row_col_key_first, + EdgeOp e_op, + typename GraphViewType::vertex_type* keys, + T* values) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + static_assert( + transform_reduce_by_adj_matrix_row_col_key_e_for_all_block_size % raft::warp_size() == 0); + auto const lane_id = tid % raft::warp_size(); + auto major_start_offset = static_cast(major_first - matrix_partition.get_major_first()); + size_t idx = static_cast(tid / raft::warp_size()); + + while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; + auto major = + matrix_partition.get_major_from_major_offset_nocheck(static_cast(major_offset)); + vertex_t const* indices{nullptr}; + thrust::optional weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = + matrix_partition.get_local_edges(static_cast(major_offset)); + auto local_offset = matrix_partition.get_local_offset(major_offset); + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + update_buffer_element( + matrix_partition, + major, + indices[i], + weights ? (*weights)[i] : weight_t{1.0}, + adj_matrix_row_value_input_first, + adj_matrix_col_value_input_first, + adj_matrix_row_col_key_first, + e_op, + keys + local_offset + i, + values + local_offset + i); + } + + idx += gridDim.x * (blockDim.x / raft::warp_size()); + } +} + +template +__global__ void for_all_major_for_all_nbr_high_degree( + matrix_partition_device_view_t matrix_partition, + typename GraphViewType::vertex_type major_first, + typename GraphViewType::vertex_type major_last, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + VertexIterator adj_matrix_row_col_key_first, + EdgeOp e_op, + typename GraphViewType::vertex_type* keys, + T* values) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + + auto major_start_offset = static_cast(major_first - matrix_partition.get_major_first()); + auto idx = static_cast(blockIdx.x); + + while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; + auto major = + matrix_partition.get_major_from_major_offset_nocheck(static_cast(major_offset)); + vertex_t const* indices{nullptr}; + thrust::optional weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = + matrix_partition.get_local_edges(static_cast(major_offset)); + auto local_offset = matrix_partition.get_local_offset(major_offset); + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + update_buffer_element( + matrix_partition, + major, + indices[i], + weights ? 
(*weights)[i] : weight_t{1.0}, + adj_matrix_row_value_input_first, + adj_matrix_col_value_input_first, + adj_matrix_row_col_key_first, + e_op, + keys + local_offset + i, + values + local_offset + i); + } + + idx += gridDim.x; + } +} + +// FIXME: better derive value_t from BufferType +template +std::tuple, BufferType> reduce_to_unique_kv_pairs( + rmm::device_uvector&& keys, BufferType&& value_buffer, cudaStream_t stream) +{ + thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), + keys.begin(), + keys.end(), + get_dataframe_buffer_begin(value_buffer)); + auto num_uniques = + thrust::count_if(rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(keys.size()), + [keys = keys.data()] __device__(auto i) { + return ((i == 0) || (keys[i] != keys[i - 1])) ? true : false; + }); + + rmm::device_uvector unique_keys(num_uniques, stream); + auto value_for_unique_key_buffer = allocate_dataframe_buffer(unique_keys.size(), stream); + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + keys.begin(), + keys.end(), + get_dataframe_buffer_begin(value_buffer), + unique_keys.begin(), + get_dataframe_buffer_begin(value_for_unique_key_buffer)); + + return std::make_tuple(std::move(unique_keys), std::move(value_for_unique_key_buffer)); +} + +template +std::tuple, + decltype(allocate_dataframe_buffer(0, cudaStream_t{nullptr}))> +transform_reduce_by_adj_matrix_row_col_key_e( + raft::handle_t const& handle, + GraphViewType const& graph_view, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + VertexIterator adj_matrix_row_col_key_first, + EdgeOp e_op, + T init) +{ + static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); + static_assert(std::is_same::value_type, + typename GraphViewType::vertex_type>::value); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + + rmm::device_uvector keys(0, handle.get_stream()); + auto value_buffer = allocate_dataframe_buffer(0, handle.get_stream()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + auto matrix_partition = + matrix_partition_device_view_t( + graph_view.get_matrix_partition_view(i)); + + int comm_root_rank = 0; + if (GraphViewType::is_multi_gpu) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + comm_root_rank = i * row_comm_size + row_comm_rank; + } + + auto num_edges = matrix_partition.get_number_of_edges(); + + rmm::device_uvector tmp_keys(num_edges, handle.get_stream()); + auto tmp_value_buffer = allocate_dataframe_buffer(tmp_keys.size(), handle.get_stream()); + + if (graph_view.get_vertex_partition_size(comm_root_rank) > 0) { + auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? vertex_t{0} + : matrix_partition.get_major_value_start_offset(); + auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? 
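+                                      /* [editor's note] with a transposed adjacency matrix the
+                                         partition's majors are columns, so column values start at
+                                         the partition's major value offset */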
matrix_partition.get_major_value_start_offset() + : vertex_t{0}; + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + if (segment_offsets) { + // FIXME: we may further improve performance by 1) concurrently running kernels on different + // segments; 2) individually tuning block sizes for different segments; and 3) adding one + // more segment for very high degree vertices and running segmented reduction + static_assert(detail::num_sparse_segments_per_vertex_partition == 3); + if ((*segment_offsets)[1] > 0) { + raft::grid_1d_block_t update_grid( + (*segment_offsets)[1], + detail::transform_reduce_by_adj_matrix_row_col_key_e_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_high_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first(), + matrix_partition.get_major_first() + (*segment_offsets)[1], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + adj_matrix_row_col_key_first + + (adj_matrix_row_key ? row_value_input_offset : col_value_input_offset), + e_op, + tmp_keys.data(), + get_dataframe_buffer_begin(tmp_value_buffer)); + } + if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + raft::grid_1d_warp_t update_grid( + (*segment_offsets)[2] - (*segment_offsets)[1], + detail::transform_reduce_by_adj_matrix_row_col_key_e_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_mid_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[1], + matrix_partition.get_major_first() + (*segment_offsets)[2], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + adj_matrix_row_col_key_first + + (adj_matrix_row_key ? row_value_input_offset : col_value_input_offset), + e_op, + tmp_keys.data(), + get_dataframe_buffer_begin(tmp_value_buffer)); + } + if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { + raft::grid_1d_thread_t update_grid( + (*segment_offsets)[3] - (*segment_offsets)[2], + detail::transform_reduce_by_adj_matrix_row_col_key_e_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_low_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[2], + matrix_partition.get_major_first() + (*segment_offsets)[3], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + adj_matrix_row_col_key_first + + (adj_matrix_row_key ? row_value_input_offset : col_value_input_offset), + e_op, + tmp_keys.data(), + get_dataframe_buffer_begin(tmp_value_buffer)); + } + if (matrix_partition.get_dcs_nzd_vertex_count() && + (*(matrix_partition.get_dcs_nzd_vertex_count()) > 0)) { + raft::grid_1d_thread_t update_grid( + *(matrix_partition.get_dcs_nzd_vertex_count()), + detail::transform_reduce_by_adj_matrix_row_col_key_e_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_hypersparse + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[3], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + adj_matrix_row_col_key_first + + (adj_matrix_row_key ? 
row_value_input_offset : col_value_input_offset),
+            e_op,
+            tmp_keys.data(),
+            get_dataframe_buffer_begin(tmp_value_buffer));
+      }
+      } else {
+        raft::grid_1d_thread_t update_grid(
+          matrix_partition.get_major_size(),
+          detail::transform_reduce_by_adj_matrix_row_col_key_e_for_all_block_size,
+          handle.get_device_properties().maxGridSize[0]);
+
+        detail::for_all_major_for_all_nbr_low_degree
+          <<>>(
+            matrix_partition,
+            matrix_partition.get_major_first(),
+            matrix_partition.get_major_last(),
+            adj_matrix_row_value_input_first + row_value_input_offset,
+            adj_matrix_col_value_input_first + col_value_input_offset,
+            adj_matrix_row_col_key_first +
+              (adj_matrix_row_key ? row_value_input_offset : col_value_input_offset),
+            e_op,
+            tmp_keys.data(),
+            get_dataframe_buffer_begin(tmp_value_buffer));
+      }
+    }
+    std::tie(tmp_keys, tmp_value_buffer) = reduce_to_unique_kv_pairs(
+      std::move(tmp_keys), std::move(tmp_value_buffer), handle.get_stream());
+
+    if (GraphViewType::is_multi_gpu) {
+      auto& comm = handle.get_comms();
+      auto const comm_size = comm.get_size();
+
+      rmm::device_uvector rx_unique_keys(0, handle.get_stream());
+      auto rx_value_for_unique_key_buffer = allocate_dataframe_buffer(0, handle.get_stream());
+      std::tie(rx_unique_keys, rx_value_for_unique_key_buffer, std::ignore) =
+        groupby_gpuid_and_shuffle_kv_pairs(
+          comm,
+          tmp_keys.begin(),
+          tmp_keys.end(),
+          get_dataframe_buffer_begin(tmp_value_buffer),
+          [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}] __device__(
+            auto val) { return key_func(val); },
+          handle.get_stream());
+
+      std::tie(tmp_keys, tmp_value_buffer) = reduce_to_unique_kv_pairs(
+        std::move(rx_unique_keys), std::move(rx_value_for_unique_key_buffer), handle.get_stream());
+    }
+
+    auto cur_size = keys.size();
+    if (cur_size == 0) {
+      keys = std::move(tmp_keys);
+      value_buffer = std::move(tmp_value_buffer);
+    } else {
+      // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we
+      // can reserve address space to avoid expensive reallocation.
+      // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management
+      keys.resize(cur_size + tmp_keys.size(), handle.get_stream());
+      resize_dataframe_buffer(value_buffer, keys.size(), handle.get_stream());
+
+      thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                   tmp_keys.begin(),
+                   tmp_keys.end(),
+                   keys.begin() + cur_size);
+      thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                   get_dataframe_buffer_begin(tmp_value_buffer),
+                   get_dataframe_buffer_begin(tmp_value_buffer) + tmp_keys.size(),
+                   get_dataframe_buffer_begin(value_buffer) + cur_size);
+    }
+  }
+
+  if (GraphViewType::is_multi_gpu) {
+    std::tie(keys, value_buffer) = reduce_to_unique_kv_pairs(
+      std::move(keys), std::move(value_buffer), handle.get_stream());
+  }
+
+  // FIXME: add init
+
+  return std::make_tuple(std::move(keys), std::move(value_buffer));
+}
+
+} // namespace detail
+
+// FIXME: EdgeOp & VertexOp in update_frontier_v_push_if_out_nbr concatenates the push indicator or
+// bucket idx with the value while EdgeOp here does not. This is inconsistent and should be fixed.
+/**
+ * @brief Iterate over the entire set of edges and reduce @p edge_op outputs to (key, value) pairs.
+ *
+ * This function is inspired by thrust::transform_reduce() and thrust::reduce_by_key(). Keys for
+ * edges are determined by the graph adjacency matrix rows.
+ *
+ * @tparam GraphViewType Type of the passed non-owning graph object.
+ * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row
+ * input properties.
+ * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column
+ * input properties.
+ * @tparam VertexIterator Type of the iterator for keys in (key, value) pairs (key type should
+ * coincide with vertex type).
+ * @tparam EdgeOp Type of the quaternary (or quinary) edge operator.
+ * @tparam T Type of the values in (key, value) pairs.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Non-owning graph object.
+ * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input
+ * properties for the first (inclusive) row (assigned to this process in multi-GPU).
+ * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first +
+ * @p graph_view.get_number_of_local_adj_matrix_partition_rows().
+ * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input
+ * properties for the first (inclusive) column (assigned to this process in multi-GPU).
+ * `adj_matrix_col_value_input_last` (exclusive) is deduced as @p adj_matrix_col_value_input_first
+ * + @p graph_view.get_number_of_local_adj_matrix_partition_cols().
+ * @param adj_matrix_row_key_first Iterator pointing to the adjacency matrix row key for the first
+ * (inclusive) row (assigned to this process in multi-GPU). `adj_matrix_row_key_last` (exclusive)
+ * is deduced as @p adj_matrix_row_key_first + @p
+ * graph_view.get_number_of_local_adj_matrix_partition_rows().
+ * @param e_op Quaternary (or quinary) operator that takes edge source, edge destination, (optional
+ * edge weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first +
+ * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0,
+ * get_number_of_local_adj_matrix_partition_cols())) and returns a transformed value to be reduced.
+ * @param init Initial value to be added to the value in each transform-reduced (key, value) pair.
+ * @return std::tuple Tuple of rmm::device_uvector and
+ * rmm::device_uvector (if T is an arithmetic scalar) or a tuple of rmm::device_uvector objects (if
+ * T is a thrust::tuple type of arithmetic scalar types, one rmm::device_uvector object per scalar
+ * type).
+ */
+template
+auto transform_reduce_by_adj_matrix_row_key_e(
+  raft::handle_t const& handle,
+  GraphViewType const& graph_view,
+  AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first,
+  AdjMatrixColValueInputIterator adj_matrix_col_value_input_first,
+  VertexIterator adj_matrix_row_key_first,
+  EdgeOp e_op,
+  T init)
+{
+  static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value);
+  static_assert(std::is_same::value_type,
+                typename GraphViewType::vertex_type>::value);
+
+  return detail::transform_reduce_by_adj_matrix_row_col_key_e(
+    handle,
+    graph_view,
+    adj_matrix_row_value_input_first,
+    adj_matrix_col_value_input_first,
+    adj_matrix_row_key_first,
+    e_op,
+    init);
+}
+
+// FIXME: EdgeOp & VertexOp in update_frontier_v_push_if_out_nbr concatenates the push indicator or
+// bucket idx with the value while EdgeOp here does not. This is inconsistent and should be fixed.
+/**
+ * @brief Iterate over the entire set of edges and reduce @p edge_op outputs to (key, value) pairs.
+ *
+ * This function is inspired by thrust::transform_reduce() and thrust::reduce_by_key(). Keys for
+ * edges are determined by the graph adjacency matrix columns.
+ *
+ * @tparam GraphViewType Type of the passed non-owning graph object.
+ * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row
+ * input properties.
+ * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column
+ * input properties.
+ * @tparam VertexIterator Type of the iterator for keys in (key, value) pairs (key type should
+ * coincide with vertex type).
+ * @tparam EdgeOp Type of the quaternary (or quinary) edge operator.
+ * @tparam T Type of the values in (key, value) pairs.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Non-owning graph object.
+ * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input
+ * properties for the first (inclusive) row (assigned to this process in multi-GPU).
+ * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first +
+ * @p graph_view.get_number_of_local_adj_matrix_partition_rows().
+ * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input
+ * properties for the first (inclusive) column (assigned to this process in multi-GPU).
+ * `adj_matrix_col_value_input_last` (exclusive) is deduced as @p adj_matrix_col_value_input_first
+ * + @p graph_view.get_number_of_local_adj_matrix_partition_cols().
+ * @param adj_matrix_col_key_first Iterator pointing to the adjacency matrix column key for the
+ * first (inclusive) column (assigned to this process in multi-GPU).
+ * `adj_matrix_col_key_last` (exclusive) is deduced as @p adj_matrix_col_key_first + @p
+ * graph_view.get_number_of_local_adj_matrix_partition_cols().
+ * @param e_op Quaternary (or quinary) operator that takes edge source, edge destination, (optional
+ * edge weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first +
+ * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0,
+ * get_number_of_local_adj_matrix_partition_cols())) and returns a transformed value to be reduced.
+ * @param init Initial value to be added to the value in each transform-reduced (key, value) pair.
+ * @return std::tuple Tuple of rmm::device_uvector and
+ * rmm::device_uvector (if T is an arithmetic scalar) or a tuple of rmm::device_uvector objects (if
+ * T is a thrust::tuple type of arithmetic scalar types, one rmm::device_uvector object per scalar
+ * type).
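+ *
+ * A minimal illustrative call (editor's sketch, not part of the original diff; `view`,
+ * `row_vals`, `col_vals`, and `col_keys` are hypothetical single-GPU inputs; the row-keyed
+ * variant above is invoked identically, just with row keys):
+ * @code
+ * auto [keys, weight_sums] = transform_reduce_by_adj_matrix_col_key_e(
+ *   handle,
+ *   view,
+ *   row_vals,
+ *   col_vals,
+ *   col_keys,
+ *   [] __device__(auto src, auto dst, auto w, auto src_val, auto dst_val) { return w; },
+ *   float{0.0});  // total incident edge weight per column key
+ * @endcode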
+ */ +template +auto transform_reduce_by_adj_matrix_col_key_e( + raft::handle_t const& handle, + GraphViewType const& graph_view, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + VertexIterator adj_matrix_col_key_first, + EdgeOp e_op, + T init) +{ + static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); + static_assert(std::is_same::value_type, + typename GraphViewType::vertex_type>::value); + + return detail::transform_reduce_by_adj_matrix_row_col_key_e( + handle, + graph_view, + adj_matrix_row_value_input_first, + adj_matrix_col_value_input_first, + adj_matrix_col_key_first, + e_op, + init); +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/cugraph/prims/transform_reduce_e.cuh b/cpp/include/cugraph/prims/transform_reduce_e.cuh new file mode 100644 index 00000000000..5ce40ea20cf --- /dev/null +++ b/cpp/include/cugraph/prims/transform_reduce_e.cuh @@ -0,0 +1,521 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +// FIXME: block size requires tuning +int32_t constexpr transform_reduce_e_for_all_block_size = 128; + +template +__global__ void for_all_major_for_all_nbr_hypersparse( + matrix_partition_device_view_t matrix_partition, + typename GraphViewType::vertex_type major_hypersparse_first, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + ResultIterator result_iter /* size 1 */, + EdgeOp e_op) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using e_op_result_t = typename std::iterator_traits::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto major_start_offset = + static_cast(major_hypersparse_first - matrix_partition.get_major_first()); + size_t idx = static_cast(tid); + + auto dcs_nzd_vertex_count = *(matrix_partition.get_dcs_nzd_vertex_count()); + + property_add edge_property_add{}; + e_op_result_t e_op_result_sum{}; + while (idx < static_cast(dcs_nzd_vertex_count)) { + auto major = + *(matrix_partition.get_major_from_major_hypersparse_idx_nocheck(static_cast(idx))); + auto major_idx = + major_start_offset + idx; // major_offset != major_idx in the hypersparse region + vertex_t const* indices{nullptr}; + thrust::optional weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(major_idx); + auto sum = thrust::transform_reduce( + thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + [&matrix_partition, + &adj_matrix_row_value_input_first, + 
&adj_matrix_col_value_input_first, + &e_op, + major, + indices, + weights] __device__(auto i) { + auto major_offset = matrix_partition.get_major_offset_from_major_nocheck(major); + auto minor = indices[i]; + auto weight = weights ? (*weights)[i] : weight_t{1.0}; + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + auto row = GraphViewType::is_adj_matrix_transposed ? minor : major; + auto col = GraphViewType::is_adj_matrix_transposed ? major : minor; + auto row_offset = GraphViewType::is_adj_matrix_transposed + ? minor_offset + : static_cast(major_offset); + auto col_offset = GraphViewType::is_adj_matrix_transposed + ? static_cast(major_offset) + : minor_offset; + return evaluate_edge_op() + .compute(row, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); + }, + e_op_result_t{}, + edge_property_add); + + e_op_result_sum = edge_property_add(e_op_result_sum, sum); + idx += gridDim.x * blockDim.x; + } + + e_op_result_sum = + block_reduce_edge_op_result().compute( + e_op_result_sum); + if (threadIdx.x == 0) { atomic_accumulate_edge_op_result(result_iter, e_op_result_sum); } +} + +template +__global__ void for_all_major_for_all_nbr_low_degree( + matrix_partition_device_view_t matrix_partition, + typename GraphViewType::vertex_type major_first, + typename GraphViewType::vertex_type major_last, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + ResultIterator result_iter /* size 1 */, + EdgeOp e_op) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using e_op_result_t = typename std::iterator_traits::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto major_start_offset = static_cast(major_first - matrix_partition.get_major_first()); + size_t idx = static_cast(tid); + + property_add edge_property_add{}; + e_op_result_t e_op_result_sum{}; + while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; + vertex_t const* indices{nullptr}; + thrust::optional weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(major_offset); + auto sum = thrust::transform_reduce( + thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + [&matrix_partition, + &adj_matrix_row_value_input_first, + &adj_matrix_col_value_input_first, + &e_op, + major_offset, + indices, + weights] __device__(auto i) { + auto minor = indices[i]; + auto weight = weights ? (*weights)[i] : weight_t{1.0}; + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + auto row = GraphViewType::is_adj_matrix_transposed + ? minor + : matrix_partition.get_major_from_major_offset_nocheck(major_offset); + auto col = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_from_major_offset_nocheck(major_offset) + : minor; + auto row_offset = GraphViewType::is_adj_matrix_transposed + ? minor_offset + : static_cast(major_offset); + auto col_offset = GraphViewType::is_adj_matrix_transposed + ? 
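+                           /* [editor's note] when the adjacency matrix is stored transposed, the
+                              partition's majors correspond to matrix columns, hence major_offset
+                              feeds col_offset here */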
static_cast(major_offset) + : minor_offset; + return evaluate_edge_op() + .compute(row, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); + }, + e_op_result_t{}, + edge_property_add); + + e_op_result_sum = edge_property_add(e_op_result_sum, sum); + idx += gridDim.x * blockDim.x; + } + + e_op_result_sum = + block_reduce_edge_op_result().compute( + e_op_result_sum); + if (threadIdx.x == 0) { atomic_accumulate_edge_op_result(result_iter, e_op_result_sum); } +} + +template +__global__ void for_all_major_for_all_nbr_mid_degree( + matrix_partition_device_view_t matrix_partition, + typename GraphViewType::vertex_type major_first, + typename GraphViewType::vertex_type major_last, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + ResultIterator result_iter /* size 1 */, + EdgeOp e_op) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using e_op_result_t = typename std::iterator_traits::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + static_assert(transform_reduce_e_for_all_block_size % raft::warp_size() == 0); + auto const lane_id = tid % raft::warp_size(); + auto major_start_offset = static_cast(major_first - matrix_partition.get_major_first()); + size_t idx = static_cast(tid / raft::warp_size()); + + property_add edge_property_add{}; + e_op_result_t e_op_result_sum{}; + while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; + vertex_t const* indices{nullptr}; + thrust::optional weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(major_offset); + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + auto minor = indices[i]; + auto weight = weights ? (*weights)[i] : weight_t{1.0}; + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + auto row = GraphViewType::is_adj_matrix_transposed + ? minor + : matrix_partition.get_major_from_major_offset_nocheck(major_offset); + auto col = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_from_major_offset_nocheck(major_offset) + : minor; + auto row_offset = GraphViewType::is_adj_matrix_transposed + ? minor_offset + : static_cast(major_offset); + auto col_offset = GraphViewType::is_adj_matrix_transposed + ? 
static_cast(major_offset) + : minor_offset; + auto e_op_result = evaluate_edge_op() + .compute(row, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); + e_op_result_sum = edge_property_add(e_op_result_sum, e_op_result); + } + idx += gridDim.x * (blockDim.x / raft::warp_size()); + } + + e_op_result_sum = + block_reduce_edge_op_result().compute( + e_op_result_sum); + if (threadIdx.x == 0) { atomic_accumulate_edge_op_result(result_iter, e_op_result_sum); } +} + +template +__global__ void for_all_major_for_all_nbr_high_degree( + matrix_partition_device_view_t matrix_partition, + typename GraphViewType::vertex_type major_first, + typename GraphViewType::vertex_type major_last, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + ResultIterator result_iter /* size 1 */, + EdgeOp e_op) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using e_op_result_t = typename std::iterator_traits::value_type; + + auto major_start_offset = static_cast(major_first - matrix_partition.get_major_first()); + size_t idx = static_cast(blockIdx.x); + + property_add edge_property_add{}; + e_op_result_t e_op_result_sum{}; + while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; + vertex_t const* indices{nullptr}; + thrust::optional weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(major_offset); + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + auto minor = indices[i]; + auto weight = weights ? (*weights)[i] : weight_t{1.0}; + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + auto row = GraphViewType::is_adj_matrix_transposed + ? minor + : matrix_partition.get_major_from_major_offset_nocheck(major_offset); + auto col = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_from_major_offset_nocheck(major_offset) + : minor; + auto row_offset = GraphViewType::is_adj_matrix_transposed + ? minor_offset + : static_cast(major_offset); + auto col_offset = GraphViewType::is_adj_matrix_transposed + ? static_cast(major_offset) + : minor_offset; + auto e_op_result = evaluate_edge_op() + .compute(row, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); + e_op_result_sum = edge_property_add(e_op_result_sum, e_op_result); + } + idx += gridDim.x; + } + + e_op_result_sum = + block_reduce_edge_op_result().compute( + e_op_result_sum); + if (threadIdx.x == 0) { atomic_accumulate_edge_op_result(result_iter, e_op_result_sum); } +} + +} // namespace detail + +/** + * @brief Iterate over the entire set of edges and reduce @p edge_op outputs. + * + * This function is inspired by thrust::transform_reduce(). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row + * input properties. + * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column + * input properties. + * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. + * @tparam T Type of the initial value. + * @param handle RAFT handle object to encapsulate resources (e.g. 
CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Non-owning graph object.
+ * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input
+ * properties for the first (inclusive) row (assigned to this process in multi-GPU).
+ * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first +
+ * @p graph_view.get_number_of_local_adj_matrix_partition_rows().
+ * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input
+ * properties for the first (inclusive) column (assigned to this process in multi-GPU).
+ * `adj_matrix_col_value_input_last` (exclusive) is deduced as @p adj_matrix_col_value_input_first
+ * + @p graph_view.get_number_of_local_adj_matrix_partition_cols().
+ * @param e_op Quaternary (or quinary) operator that takes edge source, edge destination, (optional
+ * edge weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first +
+ * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0,
+ * get_number_of_local_adj_matrix_partition_cols())) and returns a transformed value to be reduced.
+ * @param init Initial value to be added to the transform-reduced @p edge_op outputs.
+ * @return T Reduction of the @p edge_op outputs.
+ */
+template
+T transform_reduce_e(raft::handle_t const& handle,
+                     GraphViewType const& graph_view,
+                     AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first,
+                     AdjMatrixColValueInputIterator adj_matrix_col_value_input_first,
+                     EdgeOp e_op,
+                     T init)
+{
+  static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value);
+
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t = typename GraphViewType::edge_type;
+  using weight_t = typename GraphViewType::weight_type;
+
+  property_add edge_property_add{};
+
+  auto result_buffer = allocate_dataframe_buffer(1, handle.get_stream());
+  thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+               get_dataframe_buffer_begin(result_buffer),
+               get_dataframe_buffer_begin(result_buffer) + 1,
+               T{});
+
+  for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) {
+    auto matrix_partition =
+      matrix_partition_device_view_t(
+        graph_view.get_matrix_partition_view(i));
+
+    auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed
+                                    ? vertex_t{0}
+                                    : matrix_partition.get_major_value_start_offset();
+    auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed
+                                    ?
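+                                    /* [editor's note] same transposition rule as in the kernels
+                                       above: majors are columns when the matrix is transposed */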
matrix_partition.get_major_value_start_offset() + : vertex_t{0}; + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + if (segment_offsets) { + // FIXME: we may further improve performance by 1) concurrently running kernels on different + // segments; 2) individually tuning block sizes for different segments; and 3) adding one more + // segment for very high degree vertices and running segmented reduction + static_assert(detail::num_sparse_segments_per_vertex_partition == 3); + if ((*segment_offsets)[1] > 0) { + raft::grid_1d_block_t update_grid((*segment_offsets)[1], + detail::transform_reduce_e_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_high_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first(), + matrix_partition.get_major_first() + (*segment_offsets)[1], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + get_dataframe_buffer_begin(result_buffer), + e_op); + } + if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1], + detail::transform_reduce_e_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_mid_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[1], + matrix_partition.get_major_first() + (*segment_offsets)[2], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + get_dataframe_buffer_begin(result_buffer), + e_op); + } + if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { + raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2], + detail::transform_reduce_e_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_low_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[2], + matrix_partition.get_major_first() + (*segment_offsets)[3], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + get_dataframe_buffer_begin(result_buffer), + e_op); + } + if (matrix_partition.get_dcs_nzd_vertex_count() && + (*(matrix_partition.get_dcs_nzd_vertex_count()) > 0)) { + raft::grid_1d_thread_t update_grid(*(matrix_partition.get_dcs_nzd_vertex_count()), + detail::transform_reduce_e_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_hypersparse + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[3], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + get_dataframe_buffer_begin(result_buffer), + e_op); + } + } else { + if (matrix_partition.get_major_size() > 0) { + raft::grid_1d_thread_t update_grid(matrix_partition.get_major_size(), + detail::transform_reduce_e_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + + detail::for_all_major_for_all_nbr_low_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first(), + matrix_partition.get_major_last(), + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + get_dataframe_buffer_begin(result_buffer), + e_op); + } + } + } + + auto result = 
thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + get_dataframe_buffer_begin(result_buffer), + get_dataframe_buffer_begin(result_buffer) + 1, + T{}, + edge_property_add); + + if (GraphViewType::is_multi_gpu) { + result = host_scalar_allreduce(handle.get_comms(), result, handle.get_stream()); + } + + return edge_property_add(init, result); +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/patterns/transform_reduce_v.cuh b/cpp/include/cugraph/prims/transform_reduce_v.cuh similarity index 96% rename from cpp/include/patterns/transform_reduce_v.cuh rename to cpp/include/cugraph/prims/transform_reduce_v.cuh index 02538c36f47..0d5b4f9cbb6 100644 --- a/cpp/include/patterns/transform_reduce_v.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_v.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,9 @@ */ #pragma once -#include -#include -#include +#include +#include +#include #include diff --git a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh new file mode 100644 index 00000000000..e2f72c66d0b --- /dev/null +++ b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh @@ -0,0 +1,1308 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +int32_t constexpr update_frontier_v_push_if_out_nbr_for_all_block_size = 512; + +// we cannot use std::iterator_traits::value_type if Iterator is void* (reference to void +// is not allowed) +template +struct optional_payload_buffer_value_type_t; + +template +struct optional_payload_buffer_value_type_t< + PayloadIterator, + std::enable_if_t>> { + using value = typename std::iterator_traits::value_type; +}; + +template +struct optional_payload_buffer_value_type_t< + PayloadIterator, + std::enable_if_t>> { + using value = void; +}; + +// FIXME: to silence the spurious warning (missing return statement ...) 
due to the nvcc bug +// (https://stackoverflow.com/questions/64523302/cuda-missing-return-statement-at-end-of-non-void- +// function-in-constexpr-if-fun) +#if 1 +template >* = nullptr> +std::byte allocate_optional_payload_buffer(size_t size, cudaStream_t stream) +{ + return std::byte{0}; // dummy +} + +template >* = nullptr> +auto allocate_optional_payload_buffer(size_t size, cudaStream_t stream) +{ + return allocate_dataframe_buffer(size, stream); +} + +template >* = nullptr> +void* get_optional_payload_buffer_begin(std::byte& optional_payload_buffer) +{ + return static_cast(nullptr); +} + +template >* = nullptr> +auto get_optional_payload_buffer_begin( + std::add_lvalue_reference_t( + size_t{0}, cudaStream_t{nullptr}))> optional_payload_buffer) +{ + return get_dataframe_buffer_begin(optional_payload_buffer); +} +#else +auto allocate_optional_payload_buffer = [](size_t size, cudaStream_t stream) { + if constexpr (std::is_same_v) { + return std::byte{0}; // dummy + } else { + return allocate_dataframe_buffer(size, stream); + } +}; + +auto get_optional_payload_buffer_begin = [](auto& optional_payload_buffer) { + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_begin(optional_payload_buffer); + } +}; +#endif + +// FIXME: a temporary workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +// in the else part in if constexpr else statement that involves device lambda +template +struct call_v_op_t { + VertexValueInputIterator vertex_value_input_first{}; + VertexValueOutputIterator vertex_value_output_first{}; + VertexOp v_op{}; + vertex_partition_device_view_t vertex_partition{}; + size_t invalid_bucket_idx; + + template + __device__ std::enable_if_t, uint8_t> operator()( + key_t key) const + { + auto v_offset = vertex_partition.get_local_vertex_offset_from_vertex_nocheck(key); + auto v_val = *(vertex_value_input_first + v_offset); + auto v_op_result = v_op(key, v_val); + if (v_op_result) { + *(vertex_value_output_first + v_offset) = thrust::get<1>(*v_op_result); + return static_cast(thrust::get<0>(*v_op_result)); + } else { + return std::numeric_limits::max(); + } + } + + template + __device__ std::enable_if_t, uint8_t> operator()( + key_t key) const + { + auto v_offset = + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(thrust::get<0>(key)); + auto v_val = *(vertex_value_input_first + v_offset); + auto v_op_result = v_op(key, v_val); + if (v_op_result) { + *(vertex_value_output_first + v_offset) = thrust::get<1>(*v_op_result); + return static_cast(thrust::get<0>(*v_op_result)); + } else { + return std::numeric_limits::max(); + } + } +}; + +// FIXME: a temporary workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +// after if constexpr else statement that involves device lambda (bug report submitted) +template +struct check_invalid_bucket_idx_t { + __device__ bool operator()(thrust::tuple pair) + { + return thrust::get<0>(pair) == std::numeric_limits::max(); + } +}; + +template +__device__ void push_if_buffer_element( + matrix_partition_device_view_t& matrix_partition, + typename std::iterator_traits::value_type key, + typename GraphViewType::vertex_type row_offset, + typename GraphViewType::vertex_type col, + typename GraphViewType::weight_type weight, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + BufferKeyOutputIterator buffer_key_output_first, + BufferPayloadOutputIterator 
buffer_payload_output_first, + size_t* buffer_idx_ptr, + EdgeOp e_op) +{ + using vertex_t = typename GraphViewType::vertex_type; + using key_t = typename std::iterator_traits::value_type; + using payload_t = + typename optional_payload_buffer_value_type_t::value; + + auto col_offset = matrix_partition.get_minor_offset_from_minor_nocheck(col); + auto e_op_result = evaluate_edge_op() + .compute(key, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); + if (e_op_result) { + static_assert(sizeof(unsigned long long int) == sizeof(size_t)); + auto buffer_idx = atomicAdd(reinterpret_cast(buffer_idx_ptr), + static_cast(1)); + if constexpr (std::is_same_v && std::is_same_v) { + *(buffer_key_output_first + buffer_idx) = col; + } else if constexpr (std::is_same_v && !std::is_same_v) { + *(buffer_key_output_first + buffer_idx) = col; + *(buffer_payload_output_first + buffer_idx) = *e_op_result; + } else if constexpr (!std::is_same_v && std::is_same_v) { + *(buffer_key_output_first + buffer_idx) = thrust::make_tuple(col, *e_op_result); + } else { + *(buffer_key_output_first + buffer_idx) = + thrust::make_tuple(col, thrust::get<0>(*e_op_result)); + *(buffer_payload_output_first + buffer_idx) = thrust::get<1>(*e_op_result); + } + } +} + +template +__global__ void for_all_frontier_row_for_all_nbr_hypersparse( + matrix_partition_device_view_t matrix_partition, + typename GraphViewType::vertex_type major_hypersparse_first, + KeyIterator key_first, + KeyIterator key_last, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + BufferKeyOutputIterator buffer_key_output_first, + BufferPayloadOutputIterator buffer_payload_output_first, + size_t* buffer_idx_ptr, + EdgeOp e_op) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using key_t = typename std::iterator_traits::value_type; + static_assert( + std::is_same_v::value_type>); + using payload_t = + typename optional_payload_buffer_value_type_t::value; + + static_assert(!GraphViewType::is_adj_matrix_transposed, + "GraphViewType should support the push model."); + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto row_start_offset = + static_cast(major_hypersparse_first - matrix_partition.get_major_first()); + auto idx = static_cast(tid); + + auto dcs_nzd_vertices = *(matrix_partition.get_dcs_nzd_vertices()); + auto dcs_nzd_vertex_count = *(matrix_partition.get_dcs_nzd_vertex_count()); + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + vertex_t row{}; + if constexpr (std::is_same_v) { + row = key; + } else { + row = thrust::get<0>(key); + } + auto row_hypersparse_idx = matrix_partition.get_major_hypersparse_idx_from_major_nocheck(row); + if (row_hypersparse_idx) { + auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); + auto row_idx = row_start_offset + *row_hypersparse_idx; + vertex_t const* indices{nullptr}; + thrust::optional weights{nullptr}; + edge_t local_out_degree{}; + thrust::tie(indices, weights, local_out_degree) = matrix_partition.get_local_edges(row_idx); + for (edge_t i = 0; i < local_out_degree; ++i) { + push_if_buffer_element(matrix_partition, + key, + row_offset, + indices[i], + weights ? 
(*weights)[i] : weight_t{1.0}, + adj_matrix_row_value_input_first, + adj_matrix_col_value_input_first, + buffer_key_output_first, + buffer_payload_output_first, + buffer_idx_ptr, + e_op); + } + } + idx += gridDim.x * blockDim.x; + } +} + +template +__global__ void for_all_frontier_row_for_all_nbr_low_degree( + matrix_partition_device_view_t matrix_partition, + KeyIterator key_first, + KeyIterator key_last, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + BufferKeyOutputIterator buffer_key_output_first, + BufferPayloadOutputIterator buffer_payload_output_first, + size_t* buffer_idx_ptr, + EdgeOp e_op) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using key_t = typename std::iterator_traits::value_type; + static_assert( + std::is_same_v::value_type>); + using payload_t = + typename optional_payload_buffer_value_type_t::value; + + static_assert(!GraphViewType::is_adj_matrix_transposed, + "GraphViewType should support the push model."); + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto idx = static_cast(tid); + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + vertex_t row{}; + if constexpr (std::is_same_v) { + row = key; + } else { + row = thrust::get<0>(key); + } + auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); + vertex_t const* indices{nullptr}; + thrust::optional weights{nullptr}; + edge_t local_out_degree{}; + thrust::tie(indices, weights, local_out_degree) = matrix_partition.get_local_edges(row_offset); + for (edge_t i = 0; i < local_out_degree; ++i) { + push_if_buffer_element(matrix_partition, + key, + row_offset, + indices[i], + weights ? 
(*weights)[i] : weight_t{1.0}, + adj_matrix_row_value_input_first, + adj_matrix_col_value_input_first, + buffer_key_output_first, + buffer_payload_output_first, + buffer_idx_ptr, + e_op); + } + idx += gridDim.x * blockDim.x; + } +} + +template +__global__ void for_all_frontier_row_for_all_nbr_mid_degree( + matrix_partition_device_view_t matrix_partition, + KeyIterator key_first, + KeyIterator key_last, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + BufferKeyOutputIterator buffer_key_output_first, + BufferPayloadOutputIterator buffer_payload_output_first, + size_t* buffer_idx_ptr, + EdgeOp e_op) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using key_t = typename std::iterator_traits::value_type; + static_assert( + std::is_same_v::value_type>); + using payload_t = + typename optional_payload_buffer_value_type_t::value; + + static_assert(!GraphViewType::is_adj_matrix_transposed, + "GraphViewType should support the push model."); + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + static_assert(update_frontier_v_push_if_out_nbr_for_all_block_size % raft::warp_size() == 0); + auto const lane_id = tid % raft::warp_size(); + auto idx = static_cast(tid / raft::warp_size()); + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + vertex_t row{}; + if constexpr (std::is_same_v) { + row = key; + } else { + row = thrust::get<0>(key); + } + auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); + vertex_t const* indices{nullptr}; + thrust::optional weights{nullptr}; + edge_t local_out_degree{}; + thrust::tie(indices, weights, local_out_degree) = matrix_partition.get_local_edges(row_offset); + for (edge_t i = lane_id; i < local_out_degree; i += raft::warp_size()) { + push_if_buffer_element(matrix_partition, + key, + row_offset, + indices[i], + weights ? 
(*weights)[i] : weight_t{1.0}, + adj_matrix_row_value_input_first, + adj_matrix_col_value_input_first, + buffer_key_output_first, + buffer_payload_output_first, + buffer_idx_ptr, + e_op); + } + + idx += gridDim.x * (blockDim.x / raft::warp_size()); + } +} + +template +__global__ void for_all_frontier_row_for_all_nbr_high_degree( + matrix_partition_device_view_t matrix_partition, + KeyIterator key_first, + KeyIterator key_last, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + BufferKeyOutputIterator buffer_key_output_first, + BufferPayloadOutputIterator buffer_payload_output_first, + size_t* buffer_idx_ptr, + EdgeOp e_op) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using key_t = typename std::iterator_traits::value_type; + static_assert( + std::is_same_v::value_type>); + using payload_t = + typename optional_payload_buffer_value_type_t::value; + + static_assert(!GraphViewType::is_adj_matrix_transposed, + "GraphViewType should support the push model."); + + auto idx = static_cast(blockIdx.x); + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + vertex_t row{}; + if constexpr (std::is_same_v) { + row = key; + } else { + row = thrust::get<0>(key); + } + auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); + vertex_t const* indices{nullptr}; + thrust::optional weights{nullptr}; + edge_t local_out_degree{}; + thrust::tie(indices, weights, local_out_degree) = matrix_partition.get_local_edges(row_offset); + for (edge_t i = threadIdx.x; i < local_out_degree; i += blockDim.x) { + push_if_buffer_element(matrix_partition, + key, + row_offset, + indices[i], + weights ? (*weights)[i] : weight_t{1.0}, + adj_matrix_row_value_input_first, + adj_matrix_col_value_input_first, + buffer_key_output_first, + buffer_payload_output_first, + buffer_idx_ptr, + e_op); + } + + idx += gridDim.x; + } +} + +template +size_t sort_and_reduce_buffer_elements(raft::handle_t const& handle, + BufferKeyOutputIterator buffer_key_output_first, + BufferPayloadOutputIterator buffer_payload_output_first, + size_t num_buffer_elements, + ReduceOp reduce_op) +{ + using key_t = typename std::iterator_traits::value_type; + using payload_t = + typename optional_payload_buffer_value_type_t::value; + + if constexpr (std::is_same_v) { + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + buffer_key_output_first, + buffer_key_output_first + num_buffer_elements); + } else { + thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + buffer_key_output_first, + buffer_key_output_first + num_buffer_elements, + buffer_payload_output_first); + } + + size_t num_reduced_buffer_elements{}; + if constexpr (std::is_same_v) { + auto it = thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + buffer_key_output_first, + buffer_key_output_first + num_buffer_elements); + num_reduced_buffer_elements = + static_cast(thrust::distance(buffer_key_output_first, it)); + } else if constexpr (std::is_same>::value) { + // FIXME: if ReducOp is any, we may have a cheaper alternative than sort & uique (i.e. 
discard + // non-first elements) + auto it = thrust::unique_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + buffer_key_output_first, + buffer_key_output_first + num_buffer_elements, + buffer_payload_output_first); + num_reduced_buffer_elements = + static_cast(thrust::distance(buffer_key_output_first, thrust::get<0>(it))); + } else { + // FIXME: better avoid temporary buffer or at least limit the maximum buffer size (if we adopt + // CUDA cooperative group https://devblogs.nvidia.com/cooperative-groups and global sync(), we + // can use aggregate shared memory as a temporary buffer, or we can limit the buffer size, and + // split one thrust::reduce_by_key call to multiple thrust::reduce_by_key calls if the + // temporary buffer size exceeds the maximum buffer size (may be definied as percentage of the + // system HBM size or a function of the maximum number of threads in the system)) + // FIXME: actually, we can find how many unique keys are here by now. + // FIXME: if GraphViewType::is_multi_gpu is true, this should be executed on the GPU holding + // the vertex unless reduce_op is a pure function. + rmm::device_uvector keys(num_buffer_elements, handle.get_stream()); + auto value_buffer = + allocate_dataframe_buffer(num_buffer_elements, handle.get_stream()); + auto it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + buffer_key_output_first, + buffer_key_output_first + num_buffer_elements, + buffer_payload_output_first, + keys.begin(), + get_dataframe_buffer_begin(value_buffer), + thrust::equal_to(), + reduce_op); + num_reduced_buffer_elements = + static_cast(thrust::distance(keys.begin(), thrust::get<0>(it))); + // FIXME: this copy can be replaced by move + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + keys.begin(), + keys.begin() + num_reduced_buffer_elements, + buffer_key_output_first); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + get_dataframe_buffer_begin(value_buffer), + get_dataframe_buffer_begin(value_buffer) + num_reduced_buffer_elements, + buffer_payload_output_first); + } + + return num_reduced_buffer_elements; +} + +} // namespace detail + +template +typename GraphViewType::edge_type compute_num_out_nbrs_from_frontier( + raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexFrontierType const& frontier, + size_t cur_frontier_bucket_idx) +{ + static_assert(!GraphViewType::is_adj_matrix_transposed, + "GraphViewType should support the push model."); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using key_t = typename VertexFrontierType::key_type; + + edge_t ret{0}; + + if (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (beginning of col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + } + + auto const& cur_frontier_bucket = frontier.get_bucket(cur_frontier_bucket_idx); + vertex_t const* local_frontier_vertex_first{nullptr}; + vertex_t const* local_frontier_vertex_last{nullptr}; + if constexpr (std::is_same_v) { + 
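+    // untagged keys: the bucket iterators already point to plain vertices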
local_frontier_vertex_first = cur_frontier_bucket.begin(); + local_frontier_vertex_last = cur_frontier_bucket.end(); + } else { + local_frontier_vertex_first = thrust::get<0>(cur_frontier_bucket.begin().get_iterator_tuple()); + local_frontier_vertex_last = thrust::get<0>(cur_frontier_bucket.end().get_iterator_tuple()); + } + + std::vector local_frontier_sizes{}; + if (GraphViewType::is_multi_gpu) { + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + local_frontier_sizes = + host_scalar_allgather(col_comm, cur_frontier_bucket.size(), handle.get_stream()); + } else { + local_frontier_sizes = std::vector{static_cast(cur_frontier_bucket.size())}; + } + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + auto matrix_partition = + matrix_partition_device_view_t( + graph_view.get_matrix_partition_view(i)); + + if (GraphViewType::is_multi_gpu) { + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + + rmm::device_uvector frontier_vertices(local_frontier_sizes[i], + handle.get_stream_view()); + // FIXME: this copy is unnecessary, better fix RAFT comm's bcast to take const iterators for + // input + if (col_comm_rank == static_cast(i)) { + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + local_frontier_vertex_first, + local_frontier_vertex_last, + frontier_vertices.begin()); + } + device_bcast(col_comm, + frontier_vertices.data(), + frontier_vertices.data(), + frontier_vertices.size(), + static_cast(i), + handle.get_stream()); + + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + auto use_dcs = + segment_offsets + ? ((*segment_offsets).size() > (detail::num_sparse_segments_per_vertex_partition + 1)) + : false; + + ret += + use_dcs + ? thrust::transform_reduce( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + frontier_vertices.begin(), + frontier_vertices.end(), + [matrix_partition, + major_hypersparse_first = + matrix_partition.get_major_first() + + (*segment_offsets) + [detail::num_sparse_segments_per_vertex_partition]] __device__(auto major) { + if (major < major_hypersparse_first) { + auto major_offset = matrix_partition.get_major_offset_from_major_nocheck(major); + return matrix_partition.get_local_degree(major_offset); + } else { + auto major_hypersparse_idx = + matrix_partition.get_major_hypersparse_idx_from_major_nocheck(major); + return major_hypersparse_idx + ? 
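+                            // the vertex lies in the hypersparse (DCS) segment;
+                            // translate its hypersparse index into a degree lookup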
matrix_partition.get_local_degree( + matrix_partition.get_major_offset_from_major_nocheck( + major_hypersparse_first) + + *major_hypersparse_idx) + : edge_t{0}; + } + }, + edge_t{0}, + thrust::plus()) + : thrust::transform_reduce( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + frontier_vertices.begin(), + frontier_vertices.end(), + [matrix_partition] __device__(auto major) { + auto major_offset = matrix_partition.get_major_offset_from_major_nocheck(major); + return matrix_partition.get_local_degree(major_offset); + }, + edge_t{0}, + thrust::plus()); + } else { + assert(i == 0); + ret += thrust::transform_reduce( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + local_frontier_vertex_first, + local_frontier_vertex_last, + [matrix_partition] __device__(auto major) { + auto major_offset = matrix_partition.get_major_offset_from_major_nocheck(major); + return matrix_partition.get_local_degree(major_offset); + }, + edge_t{0}, + thrust::plus()); + } + } + + if (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (end of col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + } + + return ret; +} + +// FIXME: this documentation needs to be updated due to (tagged-)vertex support +/** + * @brief Update (tagged-)vertex frontier and (tagged-)vertex property values iterating over the + * outgoing edges from the frontier. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam VertexFrontierType Type of the vertex frontier class which abstracts vertex frontier + * managements. + * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row + * input properties. + * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column + * input properties. + * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam VertexValueInputIterator Type of the iterator for vertex properties. + * @tparam VertexValueOutputIterator Type of the iterator for vertex property variables. + * @tparam VertexOp Type of the binary vertex operator. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param frontier VertexFrontier class object for vertex frontier managements. This object includes + * multiple bucket objects. + * @param cur_frontier_bucket_idx Index of the VertexFrontier bucket holding vertices for the + * current iteration. + * @param next_frontier_bucket_indices Indices of the VertexFrontier buckets to store new frontier + * vertices for the next iteration. + * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input + * properties for the first (inclusive) row (assigned to this process in multi-GPU). + * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + + * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). 
+ * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input
+ * properties for the first (inclusive) column (assigned to this process in multi-GPU).
+ * `adj_matrix_col_value_input_last` (exclusive) is deduced as @p adj_matrix_col_value_input_first
+ * + @p graph_view.get_number_of_local_adj_matrix_partition_cols().
+ * @param e_op Quaternary (or quinary) operator that takes edge source, edge destination, (optional
+ * edge weight), *(@p adj_matrix_row_value_input_first + i), and *(@p
+ * adj_matrix_col_value_input_first + j) (where i is in [0,
+ * graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0,
+ * get_number_of_local_adj_matrix_partition_cols())) and returns a value to be reduced by the @p
+ * reduce_op.
+ * @param reduce_op Binary operator that takes two input arguments and reduces the two variables to
+ * one.
+ * @param vertex_value_input_first Iterator pointing to the vertex properties for the first
+ * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive)
+ * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices().
+ * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first
+ * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last`
+ * (exclusive) is deduced as @p vertex_value_output_first + @p
+ * graph_view.get_number_of_local_vertices().
+ * @param v_op Ternary operator that takes a (tagged-)vertex ID, *(@p vertex_value_input_first + i)
+ * (where i is in [0, @p graph_view.get_number_of_local_vertices())), and the reduced value of the
+ * @p e_op outputs for this vertex, and returns the target bucket index (for frontier update) and
+ * new vertex property values (to update *(@p vertex_value_output_first + i)). The target bucket
+ * index should either be VertexFrontierType::kInvalidBucketIdx or an index in @p
+ * next_frontier_bucket_indices.
+ */
+template <typename GraphViewType,
+          typename VertexFrontierType,
+          typename AdjMatrixRowValueInputIterator,
+          typename AdjMatrixColValueInputIterator,
+          typename EdgeOp,
+          typename ReduceOp,
+          typename VertexValueInputIterator,
+          typename VertexValueOutputIterator,
+          typename VertexOp>
+void update_frontier_v_push_if_out_nbr(
+  raft::handle_t const& handle,
+  GraphViewType const& graph_view,
+  VertexFrontierType& frontier,
+  size_t cur_frontier_bucket_idx,
+  std::vector<size_t> const& next_frontier_bucket_indices,
+  // FIXME: if vertices in the frontier are tagged, we should have an option to access with (vertex,
+  // tag) pair (currently we can access only with vertex, we may use cuco::static_map for this
+  // purpose)
+  AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first,
+  AdjMatrixColValueInputIterator adj_matrix_col_value_input_first,
+  EdgeOp e_op,
+  ReduceOp reduce_op,
+  // FIXME: if vertices in the frontier are tagged, we should have an option to access with (vertex,
+  // tag) pair (currently we can access only with vertex, we may use cuco::static_map for this
+  // purpose)
+  VertexValueInputIterator vertex_value_input_first,
+  // FIXME: if vertices in the frontier are tagged, we should have an option to access with (vertex,
+  // tag) pair (currently we can access only with vertex, we may use cuco::static_map for this
+  // purpose)
+  // FIXME: currently, it is undefined behavior if vertices in the frontier are tagged and the same
+  // vertex property is updated by multiple v_op invocations with the same vertex but with different
+  // tags.
+  VertexValueOutputIterator vertex_value_output_first,
+  // FIXME: this takes the (tagged-)vertex ID in addition; think about consistency with the other
+  // primitives.
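+  // Illustration only: a hypothetical BFS-style invocation (the `distances`,
+  // `invalid_distance`, and bucket-index names below are assumptions, not part of
+  // this header). e_op pushes src_dist + 1 to still-unvisited destinations,
+  // reduce_op::any keeps one of the pushed values, and v_op routes updated
+  // vertices to the next-iteration bucket:
+  //
+  //   update_frontier_v_push_if_out_nbr(
+  //     handle, graph_view, frontier, cur_bucket, std::vector<size_t>{next_bucket},
+  //     row_distances_first, col_distances_first,
+  //     [invalid_distance] __device__(auto src, auto dst, auto src_dist, auto dst_dist) {
+  //       return (dst_dist == invalid_distance) ? thrust::optional<vertex_t>{src_dist + 1}
+  //                                             : thrust::nullopt;
+  //     },
+  //     reduce_op::any<vertex_t>(),
+  //     distances.begin(), distances.begin(),
+  //     [next_bucket] __device__(auto v, auto v_dist, auto pushed_dist) {
+  //       return (pushed_dist < v_dist) ? thrust::optional<thrust::tuple<size_t, vertex_t>>{
+  //                                         thrust::make_tuple(next_bucket, pushed_dist)}
+  //                                     : thrust::nullopt;
+  //     });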
+ VertexOp v_op) +{ + static_assert(!GraphViewType::is_adj_matrix_transposed, + "GraphViewType should support the push model."); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using key_t = typename VertexFrontierType::key_type; + using payload_t = typename ReduceOp::type; + + auto frontier_key_first = frontier.get_bucket(cur_frontier_bucket_idx).begin(); + auto frontier_key_last = frontier.get_bucket(cur_frontier_bucket_idx).end(); + + // 1. fill the buffer + + if (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (beginning of col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + } + + auto key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + auto payload_buffer = + detail::allocate_optional_payload_buffer(size_t{0}, handle.get_stream()); + rmm::device_scalar buffer_idx(size_t{0}, handle.get_stream()); + std::vector local_frontier_sizes{}; + if (GraphViewType::is_multi_gpu) { + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + local_frontier_sizes = host_scalar_allgather( + col_comm, + static_cast(thrust::distance(frontier_key_first, frontier_key_last)), + handle.get_stream()); + } else { + local_frontier_sizes = std::vector{static_cast( + static_cast(thrust::distance(frontier_key_first, frontier_key_last)))}; + } + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + auto matrix_partition = + matrix_partition_device_view_t( + graph_view.get_matrix_partition_view(i)); + + auto matrix_partition_frontier_key_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + vertex_t matrix_partition_frontier_size = static_cast(local_frontier_sizes[i]); + if (GraphViewType::is_multi_gpu) { + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + + resize_dataframe_buffer( + matrix_partition_frontier_key_buffer, matrix_partition_frontier_size, handle.get_stream()); + + if (static_cast(col_comm_rank) == i) { + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + frontier_key_first, + frontier_key_last, + get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer)); + } + + device_bcast(col_comm, + frontier_key_first, + get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer), + matrix_partition_frontier_size, + i, + handle.get_stream()); + } else { + resize_dataframe_buffer( + matrix_partition_frontier_key_buffer, matrix_partition_frontier_size, handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + frontier_key_first, + frontier_key_last, + get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer)); + } + + vertex_t const* matrix_partition_frontier_row_first{nullptr}; + vertex_t const* matrix_partition_frontier_row_last{nullptr}; + if constexpr (std::is_same_v) { + matrix_partition_frontier_row_first = + get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer); + matrix_partition_frontier_row_last = + 
get_dataframe_buffer_end(matrix_partition_frontier_key_buffer); + } else { + matrix_partition_frontier_row_first = + thrust::get<0>(get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + .get_iterator_tuple()); + matrix_partition_frontier_row_last = thrust::get<0>( + get_dataframe_buffer_end(matrix_partition_frontier_key_buffer).get_iterator_tuple()); + } + + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + auto use_dcs = + segment_offsets + ? ((*segment_offsets).size() > (detail::num_sparse_segments_per_vertex_partition + 1)) + : false; + + auto max_pushes = + use_dcs ? thrust::transform_reduce( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + matrix_partition_frontier_row_first, + matrix_partition_frontier_row_last, + [matrix_partition, + major_hypersparse_first = + matrix_partition.get_major_first() + + (*segment_offsets) + [detail::num_sparse_segments_per_vertex_partition]] __device__(auto row) { + if (row < major_hypersparse_first) { + auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); + return matrix_partition.get_local_degree(row_offset); + } else { + auto row_hypersparse_idx = + matrix_partition.get_major_hypersparse_idx_from_major_nocheck(row); + return row_hypersparse_idx + ? matrix_partition.get_local_degree( + matrix_partition.get_major_offset_from_major_nocheck( + major_hypersparse_first) + + *row_hypersparse_idx) + : edge_t{0}; + } + }, + edge_t{0}, + thrust::plus()) + : thrust::transform_reduce( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + matrix_partition_frontier_row_first, + matrix_partition_frontier_row_last, + [matrix_partition] __device__(auto row) { + auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); + return matrix_partition.get_local_degree(row_offset); + }, + edge_t{0}, + thrust::plus()); + + // FIXME: This is highly pessimistic for single GPU (and multi-GPU as well if we maintain + // additional per column data for filtering in e_op). If we can pause & resume execution if + // buffer needs to be increased (and if we reserve address space to avoid expensive + // reallocation; + // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management/), we can + // start with a smaller buffer size (especially when the frontier size is large). + // for special cases when we can assure that there is no more than one push per destination + // (e.g. if cugraph::experimental::reduce_op::any is used), we can limit the buffer size to + // std::min(max_pushes, matrix_partition.get_minor_size()). + // For Volta+, we can limit the buffer size to std::min(max_pushes, + // matrix_partition.get_minor_size()) if the reduction operation is a pure function if we use + // locking. + // FIXME: if i != 0, this will require costly reallocation if we don't use the new CUDA feature + // to reserve address space. + auto new_buffer_size = buffer_idx.value(handle.get_stream()) + max_pushes; + resize_dataframe_buffer(key_buffer, new_buffer_size, handle.get_stream()); + if constexpr (!std::is_same_v) { + resize_dataframe_buffer(payload_buffer, new_buffer_size, handle.get_stream()); + } + + auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? vertex_t{0} + : matrix_partition.get_major_value_start_offset(); + if (segment_offsets) { + static_assert(detail::num_sparse_segments_per_vertex_partition == 3); + std::vector h_thresholds(detail::num_sparse_segments_per_vertex_partition + + (use_dcs ? 
1 : 0) - 1); + h_thresholds[0] = matrix_partition.get_major_first() + (*segment_offsets)[1]; + h_thresholds[1] = matrix_partition.get_major_first() + (*segment_offsets)[2]; + if (use_dcs) { h_thresholds[2] = matrix_partition.get_major_first() + (*segment_offsets)[3]; } + rmm::device_uvector d_thresholds(h_thresholds.size(), handle.get_stream()); + raft::update_device( + d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), handle.get_stream()); + rmm::device_uvector d_offsets(d_thresholds.size(), handle.get_stream()); + thrust::lower_bound(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + matrix_partition_frontier_row_first, + matrix_partition_frontier_row_last, + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin()); + std::vector h_offsets(d_offsets.size()); + raft::update_host(h_offsets.data(), d_offsets.data(), d_offsets.size(), handle.get_stream()); + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + h_offsets.push_back(matrix_partition_frontier_size); + // FIXME: we may further improve performance by 1) concurrently running kernels on different + // segments; 2) individually tuning block sizes for different segments; and 3) adding one more + // segment for very high degree vertices and running segmented reduction + if (h_offsets[0] > 0) { + raft::grid_1d_block_t update_grid( + h_offsets[0], + detail::update_frontier_v_push_if_out_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_frontier_row_for_all_nbr_high_degree + <<>>( + matrix_partition, + get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer), + get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[0], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first, + get_dataframe_buffer_begin(key_buffer), + detail::get_optional_payload_buffer_begin(payload_buffer), + buffer_idx.data(), + e_op); + } + if (h_offsets[1] - h_offsets[0] > 0) { + raft::grid_1d_warp_t update_grid( + h_offsets[1] - h_offsets[0], + detail::update_frontier_v_push_if_out_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_frontier_row_for_all_nbr_mid_degree + <<>>( + matrix_partition, + get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[0], + get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[1], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first, + get_dataframe_buffer_begin(key_buffer), + detail::get_optional_payload_buffer_begin(payload_buffer), + buffer_idx.data(), + e_op); + } + if (h_offsets[2] - h_offsets[1] > 0) { + raft::grid_1d_thread_t update_grid( + h_offsets[2] - h_offsets[1], + detail::update_frontier_v_push_if_out_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_frontier_row_for_all_nbr_low_degree + <<>>( + matrix_partition, + get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[1], + get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[2], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first, + get_dataframe_buffer_begin(key_buffer), + detail::get_optional_payload_buffer_begin(payload_buffer), + buffer_idx.data(), + e_op); + } + if (matrix_partition.get_dcs_nzd_vertex_count() && (h_offsets[3] - h_offsets[2] > 0)) { + raft::grid_1d_thread_t update_grid( + h_offsets[3] - h_offsets[2], + 
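+        // hypersparse (DCS) segment: launched with one thread per frontier key,
+        // like the low-degree kernel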
detail::update_frontier_v_push_if_out_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_frontier_row_for_all_nbr_hypersparse + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[3], + get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[2], + get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[3], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first, + get_dataframe_buffer_begin(key_buffer), + detail::get_optional_payload_buffer_begin(payload_buffer), + buffer_idx.data(), + e_op); + } + } else { + if (matrix_partition_frontier_size > 0) { + raft::grid_1d_thread_t update_grid( + matrix_partition_frontier_size, + detail::update_frontier_v_push_if_out_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + + detail::for_all_frontier_row_for_all_nbr_low_degree + <<>>( + matrix_partition, + get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer), + get_dataframe_buffer_end(matrix_partition_frontier_key_buffer), + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first, + get_dataframe_buffer_begin(key_buffer), + detail::get_optional_payload_buffer_begin(payload_buffer), + buffer_idx.data(), + e_op); + } + } + } + + if (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (beginning of col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + } + + // 2. 
reduce the buffer + + auto num_buffer_elements = detail::sort_and_reduce_buffer_elements( + handle, + get_dataframe_buffer_begin(key_buffer), + detail::get_optional_payload_buffer_begin(payload_buffer), + buffer_idx.value(handle.get_stream()), + reduce_op); + if (GraphViewType::is_multi_gpu) { + // FIXME: this step is unnecessary if row_comm_size== 1 + auto& comm = handle.get_comms(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (beginning of row_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + + std::vector h_vertex_lasts(row_comm_size); + for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { + h_vertex_lasts[i] = graph_view.get_vertex_partition_last(col_comm_rank * row_comm_size + i); + } + + rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), handle.get_stream()); + raft::update_device( + d_vertex_lasts.data(), h_vertex_lasts.data(), h_vertex_lasts.size(), handle.get_stream()); + rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), + handle.get_stream()); + vertex_t const* row_first{nullptr}; + if constexpr (std::is_same_v) { + row_first = get_dataframe_buffer_begin(key_buffer); + } else { + row_first = + thrust::get<0>(get_dataframe_buffer_begin(key_buffer).get_iterator_tuple()); + } + thrust::lower_bound(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + row_first, + row_first + num_buffer_elements, + d_vertex_lasts.begin(), + d_vertex_lasts.end(), + d_tx_buffer_last_boundaries.begin()); + std::vector h_tx_buffer_last_boundaries(d_tx_buffer_last_boundaries.size()); + raft::update_host(h_tx_buffer_last_boundaries.data(), + d_tx_buffer_last_boundaries.data(), + d_tx_buffer_last_boundaries.size(), + handle.get_stream()); + handle.get_stream_view().synchronize(); + std::vector tx_counts(h_tx_buffer_last_boundaries.size()); + std::adjacent_difference( + h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); + + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, std::ignore) = shuffle_values( + row_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + + if constexpr (!std::is_same_v) { + auto rx_payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, std::ignore) = + shuffle_values(row_comm, + get_dataframe_buffer_begin(payload_buffer), + tx_counts, + handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + } + + num_buffer_elements = detail::sort_and_reduce_buffer_elements( + handle, + get_dataframe_buffer_begin(key_buffer), + detail::get_optional_payload_buffer_begin(payload_buffer), + size_dataframe_buffer(key_buffer), + reduce_op); + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between + // two different communicators (end of row_comm) +#if 1 + // FIXME: temporary hack till UCC is 
integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + } + + // 3. update vertex properties and frontier + + if (num_buffer_elements > 0) { + static_assert(VertexFrontierType::kNumBuckets <= std::numeric_limits::max()); + rmm::device_uvector bucket_indices(num_buffer_elements, handle.get_stream()); + + auto vertex_partition = vertex_partition_device_view_t( + graph_view.get_vertex_partition_view()); + + if constexpr (!std::is_same_v) { + auto key_payload_pair_first = thrust::make_zip_iterator( + thrust::make_tuple(get_dataframe_buffer_begin(key_buffer), + detail::get_optional_payload_buffer_begin(payload_buffer))); + thrust::transform( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + key_payload_pair_first, + key_payload_pair_first + num_buffer_elements, + bucket_indices.begin(), + [vertex_value_input_first, + vertex_value_output_first, + v_op, + vertex_partition, + invalid_bucket_idx = VertexFrontierType::kInvalidBucketIdx] __device__(auto pair) { + auto key = thrust::get<0>(pair); + auto payload = thrust::get<1>(pair); + vertex_t v_offset{}; + if constexpr (std::is_same_v) { + v_offset = vertex_partition.get_local_vertex_offset_from_vertex_nocheck(key); + } else { + v_offset = + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(thrust::get<0>(key)); + } + auto v_val = *(vertex_value_input_first + v_offset); + auto v_op_result = v_op(key, v_val, payload); + if (v_op_result) { + *(vertex_value_output_first + v_offset) = thrust::get<1>(*v_op_result); + return static_cast(thrust::get<0>(*v_op_result)); + } else { + return std::numeric_limits::max(); + } + }); + + resize_dataframe_buffer(payload_buffer, size_t{0}, handle.get_stream()); + shrink_to_fit_dataframe_buffer(payload_buffer, handle.get_stream()); + } else { + thrust::transform( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(key_buffer) + num_buffer_elements, + bucket_indices.begin(), + detail::call_v_op_t{vertex_value_input_first, + vertex_value_output_first, + v_op, + vertex_partition, + VertexFrontierType::kInvalidBucketIdx}); + } + + auto bucket_key_pair_first = thrust::make_zip_iterator( + thrust::make_tuple(bucket_indices.begin(), get_dataframe_buffer_begin(key_buffer))); + bucket_indices.resize( + thrust::distance( + bucket_key_pair_first, + thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + bucket_key_pair_first, + bucket_key_pair_first + num_buffer_elements, + detail::check_invalid_bucket_idx_t())), + handle.get_stream()); + resize_dataframe_buffer(key_buffer, bucket_indices.size(), handle.get_stream()); + bucket_indices.shrink_to_fit(handle.get_stream()); + shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + + frontier.insert_to_buckets(bucket_indices.begin(), + bucket_indices.end(), + get_dataframe_buffer_begin(key_buffer), + next_frontier_bucket_indices); + } +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/cugraph/prims/vertex_frontier.cuh b/cpp/include/cugraph/prims/vertex_frontier.cuh new file mode 100644 index 00000000000..22c7ca867f5 --- /dev/null +++ b/cpp/include/cugraph/prims/vertex_frontier.cuh @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace cugraph { +namespace experimental { + +// stores unique key objects in the sorted (non-descending) order; key type is either vertex_t +// (tag_t == void) or thrust::tuple (tag_t != void) +template +class SortedUniqueKeyBucket { + static_assert(std::is_same_v || std::is_arithmetic_v); + + using optional_buffer_type = std:: + conditional_t, std::byte /* dummy */, rmm::device_uvector>; + + public: + template >* = nullptr> + SortedUniqueKeyBucket(raft::handle_t const& handle) + : handle_ptr_(&handle), vertices_(0, handle.get_stream()), tags_(std::byte{0}) + { + } + + template >* = nullptr> + SortedUniqueKeyBucket(raft::handle_t const& handle) + : handle_ptr_(&handle), vertices_(0, handle.get_stream()), tags_(0, handle.get_stream()) + { + } + + /** + * @ brief insert a vertex to the bucket + * + * @param vertex vertex to insert + */ + template >* = nullptr> + void insert(vertex_t vertex) + { + if (vertices_.size() > 0) { + rmm::device_scalar tmp(vertex, handle_ptr_->get_stream()); + insert(tmp.data(), tmp.data() + 1); + } else { + vertices_.resize(1, handle_ptr_->get_stream()); + raft::update_device(vertices_.data(), &vertex, size_t{1}, handle_ptr_->get_stream()); + } + } + + /** + * @ brief insert a (vertex, tag) pair to the bucket + * + * @param vertex vertex of the (vertex, tag) pair to insert + * @param tag tag of the (vertex, tag) pair to insert + */ + template >* = nullptr> + void insert(thrust::tuple key) + { + if (vertices_.size() > 0) { + rmm::device_scalar tmp_vertex(thrust::get<0>(key), handle_ptr_->get_stream()); + rmm::device_scalar tmp_tag(thrust::get<1>(key), handle_ptr_->get_stream()); + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(tmp_vertex.data(), tmp_tag.data())); + insert(pair_first, pair_first + 1); + } else { + vertices_.resize(1, handle_ptr_->get_stream()); + tags_.resize(1, handle_ptr_->get_stream()); + auto pair_first = + thrust::make_tuple(thrust::make_zip_iterator(vertices_.begin(), tags_.begin())); + thrust::fill(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + pair_first, + pair_first + 1, + key); + } + } + + /** + * @ brief insert a list of vertices to the bucket + * + * @param vertex_first Iterator pointing to the first (inclusive) element of the vertices stored + * in device memory. + * @param vertex_last Iterator pointing to the last (exclusive) element of the vertices stored in + * device memory. 
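+ *
+ * Inserted vertices are merged with the ones already stored and deduplicated, so the
+ * bucket's sorted-unique invariant is preserved.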
+ */ + template >* = nullptr> + void insert(VertexIterator vertex_first, VertexIterator vertex_last) + { + static_assert( + std::is_same_v::value_type, vertex_t>); + + if (vertices_.size() > 0) { + rmm::device_uvector merged_vertices( + vertices_.size() + thrust::distance(vertex_first, vertex_last), handle_ptr_->get_stream()); + thrust::merge(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + vertices_.begin(), + vertices_.end(), + vertex_first, + vertex_last, + merged_vertices.begin()); + merged_vertices.resize( + thrust::distance( + merged_vertices.begin(), + thrust::unique(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + merged_vertices.begin(), + merged_vertices.end())), + handle_ptr_->get_stream()); + merged_vertices.shrink_to_fit(handle_ptr_->get_stream()); + vertices_ = std::move(merged_vertices); + } else { + vertices_.resize(thrust::distance(vertex_first, vertex_last), handle_ptr_->get_stream()); + thrust::copy(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + vertex_first, + vertex_last, + vertices_.begin()); + } + } + + /** + * @ brief insert a list of (vertex, tag) pairs to the bucket + * + * @param key_first Iterator pointing to the first (inclusive) element of the (vertex,tag) pairs + * stored in device memory. + * @param key_last Iterator pointing to the last (exclusive) element of the (vertex,tag) pairs + * stored in device memory. + */ + template >* = nullptr> + void insert(KeyIterator key_first, KeyIterator key_last) + { + static_assert(std::is_same_v::value_type, + thrust::tuple>); + + if (vertices_.size() > 0) { + rmm::device_uvector merged_vertices( + vertices_.size() + thrust::distance(key_first, key_last), handle_ptr_->get_stream()); + rmm::device_uvector merged_tags(merged_vertices.size(), handle_ptr_->get_stream()); + auto old_pair_first = + thrust::make_zip_iterator(thrust::make_tuple(vertices_.begin(), tags_.begin())); + auto merged_pair_first = + thrust::make_zip_iterator(thrust::make_tuple(merged_vertices.begin(), merged_tags.begin())); + thrust::merge(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + old_pair_first, + old_pair_first + vertices_.size(), + key_first, + key_last, + merged_pair_first); + merged_vertices.resize( + thrust::distance( + merged_pair_first, + thrust::unique(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + merged_pair_first, + merged_pair_first + merged_vertices.size())), + handle_ptr_->get_stream()); + merged_tags.resize(merged_vertices.size(), handle_ptr_->get_stream()); + merged_vertices.shrink_to_fit(handle_ptr_->get_stream()); + merged_tags.shrink_to_fit(handle_ptr_->get_stream()); + vertices_ = std::move(merged_vertices); + tags_ = std::move(merged_tags); + } else { + vertices_.resize(thrust::distance(key_first, key_last), handle_ptr_->get_stream()); + tags_.resize(thrust::distance(key_first, key_last), handle_ptr_->get_stream()); + thrust::copy(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + key_first, + key_last, + thrust::make_zip_iterator(thrust::make_tuple(vertices_.begin(), tags_.begin()))); + } + } + + size_t size() const { return vertices_.size(); } + + template + std::enable_if_t aggregate_size() const + { + return host_scalar_allreduce( + handle_ptr_->get_comms(), vertices_.size(), handle_ptr_->get_stream()); + } + + template + std::enable_if_t aggregate_size() const + { + return vertices_.size(); + } + + void resize(size_t size) + { + 
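+    // when keys are tagged, tags_ is resized in lock-step with vertices_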
vertices_.resize(size, handle_ptr_->get_stream()); + if constexpr (!std::is_same_v) { tags_.resize(size, handle_ptr_->get_stream()); } + } + + void clear() { resize(0); } + + void shrink_to_fit() + { + vertices_.shrink_to_fit(handle_ptr_->get_stream()); + if constexpr (!std::is_same_v) { tags_.shrink_to_fit(handle_ptr_->get_stream()); } + } + +// FIXME: to silence the spurious warning (missing return statement ...) due to the nvcc bug +// (https://stackoverflow.com/questions/64523302/cuda-missing-return-statement-at-end-of-non-void- +// function-in-constexpr-if-fun) +#if 1 + template >* = nullptr> + auto const begin() const + { + return vertices_.begin(); + } + + template >* = nullptr> + auto begin() + { + return vertices_.begin(); + } + + template >* = nullptr> + auto const begin() const + { + return thrust::make_zip_iterator(thrust::make_tuple(vertices_.begin(), tags_.begin())); + } + + template >* = nullptr> + auto begin() + { + return thrust::make_zip_iterator(thrust::make_tuple(vertices_.begin(), tags_.begin())); + } +#else + auto const begin() const + { + if constexpr (std::is_same_v) { + return vertices_.begin(); + } else { + return thrust::make_zip_iterator(thrust::make_tuple(vertices_.begin(), tags_.begin())); + } + } + + auto begin() + { + if constexpr (std::is_same_v) { + return vertices_.begin(); + } else { + return thrust::make_zip_iterator(thrust::make_tuple(vertices_.begin(), tags_.begin())); + } + } +#endif + + auto const end() const { return begin() + vertices_.size(); } + + auto end() { return begin() + vertices_.size(); } + + private: + raft::handle_t const* handle_ptr_{nullptr}; + rmm::device_uvector vertices_; + optional_buffer_type tags_; +}; + +template +class VertexFrontier { + static_assert(std::is_same_v || std::is_arithmetic_v); + + public: + using key_type = + std::conditional_t, vertex_t, thrust::tuple>; + static size_t constexpr kNumBuckets = num_buckets; + static size_t constexpr kInvalidBucketIdx{std::numeric_limits::max()}; + + VertexFrontier(raft::handle_t const& handle) : handle_ptr_(&handle) + { + for (size_t i = 0; i < num_buckets; ++i) { + buckets_.emplace_back(handle); + } + } + + SortedUniqueKeyBucket& get_bucket(size_t bucket_idx) + { + return buckets_[bucket_idx]; + } + + SortedUniqueKeyBucket const& get_bucket(size_t bucket_idx) const + { + return buckets_[bucket_idx]; + } + + void swap_buckets(size_t bucket_idx0, size_t bucket_idx1) + { + std::swap(buckets_[bucket_idx0], buckets_[bucket_idx1]); + } + + template + void split_bucket(size_t this_bucket_idx, + std::vector const& move_to_bucket_indices, + SplitOp split_op) + { + auto& this_bucket = get_bucket(this_bucket_idx); + if (this_bucket.size() == 0) { return; } + + // 1. apply split_op to each bucket element + + static_assert(kNumBuckets <= std::numeric_limits::max()); + rmm::device_uvector bucket_indices(this_bucket.size(), handle_ptr_->get_stream()); + thrust::transform( + rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + this_bucket.begin(), + this_bucket.end(), + bucket_indices.begin(), + [split_op] __device__(auto key) { + auto split_op_result = split_op(key); + return static_cast(split_op_result ? *split_op_result : kInvalidBucketIdx); + }); + + // 2. 
remove elements with the invalid bucket indices + + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(bucket_indices.begin(), this_bucket.begin())); + bucket_indices.resize( + thrust::distance(pair_first, + thrust::remove_if( + rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + pair_first, + pair_first + bucket_indices.size(), + [] __device__(auto pair) { + return thrust::get<0>(pair) == static_cast(kInvalidBucketIdx); + })), + handle_ptr_->get_stream()); + this_bucket.resize(bucket_indices.size()); + bucket_indices.shrink_to_fit(handle_ptr_->get_stream()); + this_bucket.shrink_to_fit(); + + // 3. separte the elements to stay in this bucket from the elements to be moved to other buckets + + pair_first = + thrust::make_zip_iterator(thrust::make_tuple(bucket_indices.begin(), this_bucket.begin())); + auto new_this_bucket_size = static_cast(thrust::distance( + pair_first, + thrust::stable_partition( // stalbe_partition to maintain sorted order within each bucket + rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + pair_first, + pair_first + bucket_indices.size(), + [this_bucket_idx = static_cast(this_bucket_idx)] __device__(auto pair) { + return thrust::get<0>(pair) == this_bucket_idx; + }))); + + // 4. insert to target buckets and resize this bucket + + insert_to_buckets(bucket_indices.begin() + new_this_bucket_size, + bucket_indices.end(), + this_bucket.begin() + new_this_bucket_size, + move_to_bucket_indices); + + this_bucket.resize(new_this_bucket_size); + this_bucket.shrink_to_fit(); + } + + template + void insert_to_buckets(uint8_t* bucket_idx_first /* [INOUT] */, + uint8_t* bucket_idx_last /* [INOUT] */, + KeyIterator key_first /* [INOUT] */, + std::vector const& to_bucket_indices) + { + // 1. 
group the elements by their target bucket indices + + auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(bucket_idx_first, key_first)); + auto pair_last = pair_first + thrust::distance(bucket_idx_first, bucket_idx_last); + + std::vector insert_bucket_indices{}; + std::vector insert_offsets{}; + std::vector insert_sizes{}; + if (to_bucket_indices.size() == 1) { + insert_bucket_indices = to_bucket_indices; + insert_offsets = {0}; + insert_sizes = {static_cast(thrust::distance(pair_first, pair_last))}; + } else if (to_bucket_indices.size() == 2) { + auto next_bucket_size = static_cast(thrust::distance( + pair_first, + thrust::stable_partition( // stalbe_partition to maintain sorted order within each bucket + rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + pair_first, + pair_last, + [next_bucket_idx = static_cast(to_bucket_indices[0])] __device__(auto pair) { + return thrust::get<0>(pair) == next_bucket_idx; + }))); + insert_bucket_indices = to_bucket_indices; + insert_offsets = {0, next_bucket_size}; + insert_sizes = { + next_bucket_size, + static_cast(thrust::distance(pair_first + next_bucket_size, pair_last))}; + } else { + thrust::stable_sort( // stalbe_sort to maintain sorted order within each bucket + rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + pair_first, + pair_last, + [] __device__(auto lhs, auto rhs) { return thrust::get<0>(lhs) < thrust::get<0>(rhs); }); + rmm::device_uvector d_indices(to_bucket_indices.size(), handle_ptr_->get_stream()); + rmm::device_uvector d_counts(d_indices.size(), handle_ptr_->get_stream()); + auto it = thrust::reduce_by_key( + rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + bucket_idx_first, + bucket_idx_last, + thrust::make_constant_iterator(size_t{1}), + d_indices.begin(), + d_counts.begin()); + d_indices.resize(thrust::distance(d_indices.begin(), thrust::get<0>(it)), + handle_ptr_->get_stream()); + d_counts.resize(d_indices.size(), handle_ptr_->get_stream()); + std::vector h_indices(d_indices.size()); + std::vector h_counts(h_indices.size()); + raft::update_host( + h_indices.data(), d_indices.data(), d_indices.size(), handle_ptr_->get_stream()); + raft::update_host( + h_counts.data(), d_counts.data(), d_counts.size(), handle_ptr_->get_stream()); + handle_ptr_->get_stream_view().synchronize(); + + size_t offset{0}; + for (size_t i = 0; i < h_indices.size(); ++i) { + insert_bucket_indices[i] = static_cast(h_indices[i]); + insert_offsets[i] = offset; + insert_sizes[i] = h_counts[i]; + offset += insert_sizes[i]; + } + } + + // 2. insert to the target buckets + + for (size_t i = 0; i < insert_offsets.size(); ++i) { + get_bucket(insert_bucket_indices[i]) + .insert(key_first + insert_offsets[i], key_first + (insert_offsets[i] + insert_sizes[i])); + } + } + + private: + raft::handle_t const* handle_ptr_{nullptr}; + std::vector> buckets_{}; +}; + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/cugraph/serialization/serializer.hpp b/cpp/include/cugraph/serialization/serializer.hpp new file mode 100644 index 00000000000..240df1d304a --- /dev/null +++ b/cpp/include/cugraph/serialization/serializer.hpp @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Andrei Schaffer, aschaffer@nvidia.com +// +#pragma once + +#include + +#include + +#include + +#include +#include + +namespace cugraph { +namespace serializer { + +using namespace cugraph::experimental; + +class serializer_t { + public: + using byte_t = uint8_t; + + using device_byte_it = typename rmm::device_uvector::iterator; + using device_byte_cit = typename rmm::device_uvector::const_iterator; + + // cnstr. for serialize() path: + // + serializer_t(raft::handle_t const& handle, size_t total_sz_bytes) + : handle_(handle), + d_storage_(total_sz_bytes, handle.get_stream()), + begin_(d_storage_.begin()), + cbegin_(d_storage_.begin()) + { + } + + // cnstr. for unserialize() path: + // + serializer_t(raft::handle_t const& handle, byte_t const* ptr_d_storage) + : handle_(handle), d_storage_(0, handle.get_stream()), cbegin_(ptr_d_storage) + { + } + + template + struct graph_meta_t; + + template + struct graph_meta_t> { + // purposely empty, for now; + // FIXME: provide implementation for multi-gpu version + }; + + template + struct graph_meta_t> { + using vertex_t = typename graph_t::vertex_type; + using bool_ser_t = uint8_t; + + graph_meta_t(void) {} + + explicit graph_meta_t(graph_t const& graph) + : num_vertices_(graph.get_number_of_vertices()), + num_edges_(graph.get_number_of_edges()), + properties_(graph.get_graph_properties()), + is_weighted_(graph.is_weighted()), + segment_offsets_(graph.view().get_local_adj_matrix_partition_segment_offsets(0)) + { + } + + graph_meta_t(size_t num_vertices, + size_t num_edges, + graph_properties_t const& properties, + bool is_weighted, + std::optional> const& segment_offsets) + : num_vertices_(num_vertices), + num_edges_(num_edges), + properties_(properties), + is_weighted_(is_weighted), + segment_offsets_(segment_offsets) + { + } + + size_t num_vertices_; + size_t num_edges_; + graph_properties_t properties_{}; + bool is_weighted_{}; + std::optional> segment_offsets_{}; + + size_t get_device_sz_bytes(void) const + { + return 2 * sizeof(size_t) + + (segment_offsets_ ? (*segment_offsets_).size() : size_t{0}) * sizeof(vertex_t) + + 3 * sizeof(bool_ser_t); + } + }; + + // POD-type serialization: + // + template + void serialize(value_t val); + + // POD-type unserialization: + // + template + value_t unserialize(void); + + // device array serialization: + // + template + void serialize(value_t const* p_d_src, size_t size); + + // device vector unserialization; + // extracts device_uvector of `size` bytes_to_value_t elements: + // + template + rmm::device_uvector unserialize( + size_t size); // size of device vector to be unserialized + + // graph serialization, + // with device storage and host metadata: + // (associated with target; e.g., num_vertices, etc.) + // + template + void serialize(graph_t const& graph, graph_meta_t& gmeta); // serialization target + + // graph unserialization, + // with device storage and host metadata: + // (associated with target; e.g., num_vertices, etc.) 
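+  // For illustration only -- a hypothetical single-GPU round trip; `handle` and
+  // `graph` are assumed to exist, with graph_t a single-GPU graph type:
+  //
+  //   serializer_t::graph_meta_t<graph_t> gmeta{graph};
+  //   auto [dev_sz, host_sz] = serializer_t::get_device_graph_sz_bytes(graph);
+  //   serializer_t ser(handle, dev_sz);
+  //   ser.serialize(graph, gmeta);
+  //   serializer_t unser(handle, ser.get_storage());
+  //   auto graph_copy = unser.unserialize<graph_t>(dev_sz, host_sz);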
+ // + template + graph_t unserialize(size_t device_sz_bytes, size_t host_sz_bytes); + + template + static std::pair get_device_graph_sz_bytes( + graph_meta_t const& graph_meta) + { + using vertex_t = typename graph_t::vertex_type; + using edge_t = typename graph_t::edge_type; + using weight_t = typename graph_t::weight_type; + + if constexpr (!graph_t::is_multi_gpu) { + size_t num_vertices = graph_meta.num_vertices_; + size_t num_edges = graph_meta.num_edges_; + + size_t weight_storage_sz = graph_meta.is_weighted_ ? num_edges * sizeof(weight_t) : 0; + + size_t device_ser_sz = + (num_vertices + 1) * sizeof(edge_t) + num_edges * sizeof(vertex_t) + weight_storage_sz; + + size_t host_ser_sz = graph_meta.get_device_sz_bytes(); + + return std::make_pair( + device_ser_sz, + host_ser_sz); // FIXME: remove when host_bcast() becomes available for host vectors + + } else { + CUGRAPH_FAIL("Unsupported graph type for un/serialization."); + + return std::pair{}; + } + } + + template + static std::pair get_device_graph_sz_bytes(graph_t const& graph) + { + graph_meta_t gmeta{graph}; + return get_device_graph_sz_bytes(gmeta); + } + + byte_t const* get_storage(void) const { return d_storage_.begin(); } + byte_t* get_storage(void) { return d_storage_.begin(); } + + private: + // serialization of graph metadata, via device orchestration: + // + template + void serialize(graph_meta_t const& graph_meta); + + // unserialization of graph metadata, via device orchestration: + // + template + graph_meta_t unserialize( + size_t graph_meta_sz_bytes, + graph_meta_t const& empty_meta); // tag dispatching to avoid conflict with + // `unserialize(size_t)` for device vectors + + raft::handle_t const& handle_; + rmm::device_uvector d_storage_; + device_byte_it begin_{nullptr}; // advances on serialize() + device_byte_cit cbegin_{nullptr}; // advances on unserialize() +}; + +} // namespace serializer +} // namespace cugraph diff --git a/cpp/include/cugraph/utilities/collect_comm.cuh b/cpp/include/cugraph/utilities/collect_comm.cuh new file mode 100644 index 00000000000..1e15afea1e5 --- /dev/null +++ b/cpp/include/cugraph/utilities/collect_comm.cuh @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+namespace cugraph {
+namespace experimental {
+
+// for key = [map_key_first, map_key_last), key_to_gpu_id_op(key) should coincide with
+// comm.get_rank()
+template <typename VertexIterator0,
+          typename VertexIterator1,
+          typename ValueIterator,
+          typename KeyToGPUIdOp>
+decltype(allocate_dataframe_buffer<typename std::iterator_traits<ValueIterator>::value_type>(
+  0, cudaStream_t{nullptr}))
+collect_values_for_keys(raft::comms::comms_t const& comm,
+                        VertexIterator0 map_key_first,
+                        VertexIterator0 map_key_last,
+                        ValueIterator map_value_first,
+                        VertexIterator1 collect_key_first,
+                        VertexIterator1 collect_key_last,
+                        KeyToGPUIdOp key_to_gpu_id_op,
+                        rmm::cuda_stream_view stream_view)
+{
+  using vertex_t = typename std::iterator_traits<VertexIterator0>::value_type;
+  static_assert(
+    std::is_same<typename std::iterator_traits<VertexIterator1>::value_type, vertex_t>::value);
+  using value_t = typename std::iterator_traits<ValueIterator>::value_type;
+
+  double constexpr load_factor = 0.7;
+
+  // FIXME: we may compare the performance & memory footprint of this hash based approach vs binary
+  // search based approach (especially when thrust::distance(collect_key_first, collect_key_last) <<
+  // thrust::distance(map_key_first, map_key_last))
+
+  // 1. build a cuco::static_map object for the map k, v pairs.
+
+  auto poly_alloc = rmm::mr::polymorphic_allocator<char>(rmm::mr::get_current_device_resource());
+  auto stream_adapter = rmm::mr::make_stream_allocator_adaptor(poly_alloc, stream_view);
+  auto kv_map_ptr = std::make_unique<
+    cuco::static_map<vertex_t, value_t, cuda::thread_scope_device, decltype(stream_adapter)>>(
+    // cuco::static_map requires at least one empty slot
+    std::max(static_cast<size_t>(
+               static_cast<double>(thrust::distance(map_key_first, map_key_last)) / load_factor),
+             static_cast<size_t>(thrust::distance(map_key_first, map_key_last)) + 1),
+    invalid_vertex_id<vertex_t>::value,
+    invalid_vertex_id<vertex_t>::value,
+    stream_adapter);
+  {
+    auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first));
+    kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last));
+  }
+
+  // 2. collect values for the unique keys in [collect_key_first, collect_key_last)
+
+  rmm::device_uvector<vertex_t> unique_keys(thrust::distance(collect_key_first, collect_key_last),
+                                            stream_view);
+  thrust::copy(
+    rmm::exec_policy(stream_view), collect_key_first, collect_key_last, unique_keys.begin());
+  thrust::sort(rmm::exec_policy(stream_view), unique_keys.begin(), unique_keys.end());
+  unique_keys.resize(
+    thrust::distance(
+      unique_keys.begin(),
+      thrust::unique(rmm::exec_policy(stream_view), unique_keys.begin(), unique_keys.end())),
+    stream_view);
+
+  rmm::device_uvector<value_t> values_for_unique_keys(0, stream_view);
+  {
+    rmm::device_uvector<vertex_t> rx_unique_keys(0, stream_view);
+    std::vector<size_t> rx_value_counts{};
+    std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values(
+      comm,
+      unique_keys.begin(),
+      unique_keys.end(),
+      [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); },
+      stream_view);
+
+    rmm::device_uvector<value_t> values_for_rx_unique_keys(rx_unique_keys.size(), stream_view);
+
+    stream_view.synchronize();  // cuco::static_map currently does not take stream
+
+    kv_map_ptr->find(
+      rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin());
+
+    rmm::device_uvector<value_t> rx_values_for_unique_keys(0, stream_view);
+    std::tie(rx_values_for_unique_keys, std::ignore) =
+      shuffle_values(comm, values_for_rx_unique_keys.begin(), rx_value_counts, stream_view);
+
+    values_for_unique_keys = std::move(rx_values_for_unique_keys);
+  }
+
+  // 3. re-build a cuco::static_map object for the k, v pairs in unique_keys,
+  // values_for_unique_keys.
+
+  stream_view.synchronize();  // cuco::static_map currently does not take stream
+
+  kv_map_ptr.reset();
+
+  kv_map_ptr = std::make_unique<
+    cuco::static_map<vertex_t, value_t, cuda::thread_scope_device, decltype(stream_adapter)>>(
+    // cuco::static_map requires at least one empty slot
+    std::max(static_cast<size_t>(static_cast<double>(unique_keys.size()) / load_factor),
+             unique_keys.size() + 1),
+    invalid_vertex_id<vertex_t>::value,
+    invalid_vertex_id<vertex_t>::value,
+    stream_adapter);
+  {
+    auto pair_first = thrust::make_zip_iterator(
+      thrust::make_tuple(unique_keys.begin(), values_for_unique_keys.begin()));
+    kv_map_ptr->insert(pair_first, pair_first + unique_keys.size());
+  }
+
+  // 4. find values for [collect_key_first, collect_key_last)
+
+  auto value_buffer = allocate_dataframe_buffer<value_t>(
+    thrust::distance(collect_key_first, collect_key_last), stream_view);
+  kv_map_ptr->find(
+    collect_key_first, collect_key_last, get_dataframe_buffer_begin<value_t>(value_buffer));
+
+  return value_buffer;
+}
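+// a usage sketch under assumed names: `comm`, `handle`, the key/value vectors,
+// and the modulo mapping below are hypothetical; any op that maps a key to the
+// rank owning it satisfies the contract stated in the comment above.
+//
+//   int const comm_size = comm.get_size();
+//   auto key_to_gpu_id_op = [comm_size] __device__(auto key) {
+//     return static_cast<int>(key % comm_size);  // owner rank of `key`
+//   };
+//   auto values = collect_values_for_keys(comm,
+//                                         map_keys.begin(), map_keys.end(),
+//                                         map_values.begin(),
+//                                         query_keys.begin(), query_keys.end(),
+//                                         key_to_gpu_id_op,
+//                                         handle.get_stream_view());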
+// for key = [map_key_first, map_key_last), key_to_gpu_id_op(key) should coincide with
+// comm.get_rank()
+template <typename VertexIterator0,
+          typename VertexIterator1,
+          typename ValueIterator,
+          typename KeyToGPUIdOp>
+decltype(allocate_dataframe_buffer<typename std::iterator_traits<ValueIterator>::value_type>(
+  0, cudaStream_t{nullptr}))
+collect_values_for_unique_keys(raft::comms::comms_t const& comm,
+                               VertexIterator0 map_key_first,
+                               VertexIterator0 map_key_last,
+                               ValueIterator map_value_first,
+                               VertexIterator1 collect_unique_key_first,
+                               VertexIterator1 collect_unique_key_last,
+                               KeyToGPUIdOp key_to_gpu_id_op,
+                               rmm::cuda_stream_view stream_view)
+{
+  using vertex_t = typename std::iterator_traits<VertexIterator0>::value_type;
+  static_assert(
+    std::is_same<typename std::iterator_traits<VertexIterator1>::value_type, vertex_t>::value);
+  using value_t = typename std::iterator_traits<ValueIterator>::value_type;
+
+  double constexpr load_factor = 0.7;
+
+  // FIXME: we may compare the performance & memory footprint of this hash based approach vs binary
+  // search based approach (especially when thrust::distance(collect_unique_key_first,
+  // collect_unique_key_last) << thrust::distance(map_key_first, map_key_last))
+
+  // 1. build a cuco::static_map object for the map k, v pairs.
+
+  auto poly_alloc = rmm::mr::polymorphic_allocator<char>(rmm::mr::get_current_device_resource());
+  auto stream_adapter = rmm::mr::make_stream_allocator_adaptor(poly_alloc, stream_view);
+  auto kv_map_ptr = std::make_unique<
+    cuco::static_map<vertex_t, value_t, cuda::thread_scope_device, decltype(stream_adapter)>>(
+    // cuco::static_map requires at least one empty slot
+    std::max(static_cast<size_t>(
+               static_cast<double>(thrust::distance(map_key_first, map_key_last)) / load_factor),
+             static_cast<size_t>(thrust::distance(map_key_first, map_key_last)) + 1),
+    invalid_vertex_id<vertex_t>::value,
+    invalid_vertex_id<vertex_t>::value,
+    stream_adapter);
+  {
+    auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first));
+    kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last));
+  }
+
+  // 2.
collect values for the unique keys in [collect_unique_key_first, collect_unique_key_last) + + rmm::device_uvector unique_keys( + thrust::distance(collect_unique_key_first, collect_unique_key_last), stream_view); + thrust::copy(rmm::exec_policy(stream_view), + collect_unique_key_first, + collect_unique_key_last, + unique_keys.begin()); + + rmm::device_uvector values_for_unique_keys(0, stream_view); + { + rmm::device_uvector rx_unique_keys(0, stream_view); + std::vector rx_value_counts{}; + std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values( + comm, + unique_keys.begin(), + unique_keys.end(), + [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); }, + stream_view); + + rmm::device_uvector values_for_rx_unique_keys(rx_unique_keys.size(), stream_view); + + stream_view.synchronize(); // cuco::static_map currently does not take stream + + kv_map_ptr->find( + rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin()); + + rmm::device_uvector rx_values_for_unique_keys(0, stream_view); + std::tie(rx_values_for_unique_keys, std::ignore) = + shuffle_values(comm, values_for_rx_unique_keys.begin(), rx_value_counts, stream_view); + + values_for_unique_keys = std::move(rx_values_for_unique_keys); + } + + // 3. re-build a cuco::static_map object for the k, v pairs in unique_keys, + // values_for_unique_keys. + + stream_view.synchronize(); // cuco::static_map currently does not take stream + + kv_map_ptr.reset(); + + kv_map_ptr = std::make_unique< + cuco::static_map>( + // cuco::static_map requires at least one empty slot + std::max(static_cast(static_cast(unique_keys.size()) / load_factor), + unique_keys.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value, + stream_adapter); + { + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(unique_keys.begin(), values_for_unique_keys.begin())); + kv_map_ptr->insert(pair_first, pair_first + unique_keys.size()); + } + + // 4. find values for [collect_unique_key_first, collect_unique_key_last) + + auto value_buffer = allocate_dataframe_buffer( + thrust::distance(collect_unique_key_first, collect_unique_key_last), stream_view); + kv_map_ptr->find(collect_unique_key_first, + collect_unique_key_last, + get_dataframe_buffer_begin(value_buffer)); + + return value_buffer; +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/cugraph/utilities/cython.hpp b/cpp/include/cugraph/utilities/cython.hpp new file mode 100644 index 00000000000..2bd4e86aa7e --- /dev/null +++ b/cpp/include/cugraph/utilities/cython.hpp @@ -0,0 +1,609 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+namespace cugraph {
+namespace cython {
+
+enum class numberTypeEnum : int { int32Type, int64Type, floatType, doubleType };
+
+enum class graphTypeEnum : int {
+  // represents uninitialized or NULL ptr
+  null,
+  // represents some legacy Cxx type. This and other LegacyCxx values are not
+  // used for the unique_ptr in a graph_container_t, but instead for when this
+  // enum is used for determining high-level code paths to take to prevent
+  // needing to expose each legacy enum value to cython.
+  LegacyCSR,
+  LegacyCSC,
+  LegacyCOO,
+  // represents that a GraphCxxView* unique_ptr type is present in a
+  // graph_container_t.
+  GraphCSRViewFloat,
+  GraphCSRViewDouble,
+  GraphCSCViewFloat,
+  GraphCSCViewDouble,
+  GraphCOOViewFloat,
+  GraphCOOViewDouble,
+  // represents values present in the graph_container_t to construct a graph_t,
+  // but unlike legacy classes does not mean a graph_t unique_ptr is present in
+  // the container.
+  graph_t,
+};
+
+// "container" for a graph type instance which insulates the owner from the
+// specifics of the actual graph type. This is intended to be used in Cython
+// code that only needs to pass a graph object to another wrapped C++ API. This
+// greatly simplifies the Cython code since the Cython definition only needs to
+// define the container and not the various individual graph types in Cython.
+struct graph_container_t {
+  // FIXME: This union is in place only to support legacy calls, remove when
+  // migration to graph_t types is complete, or when legacy graph objects are
+  // constructed in the call_<algo> wrappers instead of the
+  // populate_graph_container_legacy() function.
+  union graphPtrUnion {
+    ~graphPtrUnion() {}
+
+    void* null;
+    std::unique_ptr<legacy::GraphCSRView<int32_t, int32_t, float>> GraphCSRViewFloatPtr;
+    std::unique_ptr<legacy::GraphCSRView<int32_t, int32_t, double>> GraphCSRViewDoublePtr;
+    std::unique_ptr<legacy::GraphCSCView<int32_t, int32_t, float>> GraphCSCViewFloatPtr;
+    std::unique_ptr<legacy::GraphCSCView<int32_t, int32_t, double>> GraphCSCViewDoublePtr;
+    std::unique_ptr<legacy::GraphCOOView<int32_t, int32_t, float>> GraphCOOViewFloatPtr;
+    std::unique_ptr<legacy::GraphCOOView<int32_t, int32_t, double>> GraphCOOViewDoublePtr;
+  };
+
+  graph_container_t() : graph_ptr_union{nullptr}, graph_type{graphTypeEnum::null} {}
+  ~graph_container_t() {}
+
+  // The expected usage of a graph_container_t is for it to be created as part
+  // of a cython wrapper simply for passing a templated instantiation of a
+  // particular graph class from one call to another, and not to exist outside
+  // of the individual wrapper function (deleted when the instance goes out of
+  // scope once the wrapper function returns). Therefore, copies and assignments
+  // to an instance are not supported and these methods are deleted.
+  graph_container_t(const graph_container_t&) = delete;
+  graph_container_t& operator=(const graph_container_t&) = delete;
+
+  graphPtrUnion graph_ptr_union;
+  graphTypeEnum graph_type;
+
+  // primitive data used for constructing graph_t instances.
+  void* src_vertices;
+  void* dst_vertices;
+  void* weights;
+  bool is_weighted;
+  void* vertex_partition_offsets;
+  void* segment_offsets;
+  size_t num_segments;
+
+  size_t num_local_edges;
+  size_t num_global_vertices;
+  size_t num_global_edges;
+  numberTypeEnum vertexType;
+  numberTypeEnum edgeType;
+  numberTypeEnum weightType;
+  bool transposed;
+  bool is_multi_gpu;
+  bool do_expensive_check;
+  int row_comm_size;
+  int col_comm_size;
+  int row_comm_rank;
+  int col_comm_rank;
+  experimental::graph_properties_t graph_props;
+};
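+// a minimal lifetime sketch (the wrapper code and argument names are
+// hypothetical): the container is created empty, populated exactly once, and
+// never copied.
+//
+//   graph_container_t gc{};                      // graph_type == graphTypeEnum::null
+//   populate_graph_container(gc, handle, ...);   // fills type/meta fields (see below)
+//   // pass `gc` by reference to a call_* wrapper; copy/assign are deleted,
+//   // so the container cannot outlive the wrapper function by accident.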
+/**
+ * @brief  Owning struct. Allows returning multiple edge lists and edge offsets.
+ *         cython only
+ *
+ * @param  number_of_vertices    The total number of vertices
+ * @param  number_of_edges       The total number of edges (number of elements in src_indices,
+ *                               dst_indices and edge_data)
+ * @param  number_of_subgraph    The number of subgraphs, i.e. the number of elements in
+ *                               subgraph_offsets - 1
+ * @param  source_indices        This array of size E (number of edges) contains the index of the
+ *                               source for each edge. Indices must be in the range [0, V-1].
+ * @param  destination_indices   This array of size E (number of edges) contains the index of the
+ *                               destination for each edge. Indices must be in the range [0, V-1].
+ * @param  edge_data             This array of size E (number of edges) contains the weight for
+ *                               each edge. This array can be null in which case the graph is
+ *                               considered unweighted.
+ * @param  subgraph_offsets      This array of size number_of_subgraph + 1 contains edge offsets
+ *                               for each subgraph
+ */
+struct cy_multi_edgelists_t {
+  size_t number_of_vertices;
+  size_t number_of_edges;
+  size_t number_of_subgraph;
+  std::unique_ptr<rmm::device_buffer> src_indices;
+  std::unique_ptr<rmm::device_buffer> dst_indices;
+  std::unique_ptr<rmm::device_buffer> edge_data;
+  std::unique_ptr<rmm::device_buffer> subgraph_offsets;
+};
+
+// replacement for std::tuple<,,>, since std::tuple is not
+// supported in cython
+//
+template <typename vertex_t, typename edge_t, typename weight_t>
+struct major_minor_weights_t {
+  explicit major_minor_weights_t(raft::handle_t const& handle)
+    : shuffled_major_vertices_(0, handle.get_stream()),
+      shuffled_minor_vertices_(0, handle.get_stream()),
+      shuffled_weights_(0, handle.get_stream())
+  {
+  }
+
+  rmm::device_uvector<vertex_t>& get_major(void) { return shuffled_major_vertices_; }
+
+  rmm::device_uvector<vertex_t>& get_minor(void) { return shuffled_minor_vertices_; }
+
+  rmm::device_uvector<weight_t>& get_weights(void) { return shuffled_weights_; }
+
+  std::vector<edge_t>& get_edge_counts(void) { return edge_counts_; }
+
+  std::pair<std::unique_ptr<rmm::device_buffer>, size_t> get_major_wrap(
+    void)  // const: triggers errors in Cython autogen-ed C++
+  {
+    return std::make_pair(std::make_unique<rmm::device_buffer>(shuffled_major_vertices_.release()),
+                          sizeof(vertex_t));
+  }
+
+  std::pair<std::unique_ptr<rmm::device_buffer>, size_t> get_minor_wrap(void)  // const
+  {
+    return std::make_pair(std::make_unique<rmm::device_buffer>(shuffled_minor_vertices_.release()),
+                          sizeof(vertex_t));
+  }
+
+  std::pair<std::unique_ptr<rmm::device_buffer>, size_t> get_weights_wrap(void)  // const
+  {
+    return std::make_pair(std::make_unique<rmm::device_buffer>(shuffled_weights_.release()),
+                          sizeof(weight_t));
+  }
+
+  std::unique_ptr<std::vector<edge_t>> get_edge_counts_wrap(void)  // const
+  {
+    return std::make_unique<std::vector<edge_t>>(edge_counts_);
+  }
+
+ private:
+  rmm::device_uvector<vertex_t> shuffled_major_vertices_;
+  rmm::device_uvector<vertex_t> shuffled_minor_vertices_;
+  rmm::device_uvector<weight_t> shuffled_weights_;
+  std::vector<edge_t> edge_counts_{};
+};
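+// the *_wrap() getters above release device memory into an rmm::device_buffer
+// so Cython can own it without knowing vertex_t/weight_t; a sketch, assuming a
+// populated major_minor_weights_t `mmw` (hypothetical):
+//
+//   auto [major_buf, major_elem_sz] = mmw.get_major_wrap();
+//   // major_buf is a std::unique_ptr<rmm::device_buffer>; the element count is
+//   // major_buf->size() / major_elem_sz, recoverable on the Python side.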
+
+// aggregate for random_walks() return type
+// to be exposed to cython:
+//
+struct random_walk_ret_t {
+  size_t coalesced_sz_v_;
+  size_t coalesced_sz_w_;
+  size_t num_paths_;
+  size_t max_depth_;
+  std::unique_ptr<rmm::device_buffer> d_coalesced_v_;
+  std::unique_ptr<rmm::device_buffer> d_coalesced_w_;
+  std::unique_ptr<rmm::device_buffer> d_sizes_;
+};
+
+struct random_walk_path_t {
+  std::unique_ptr<rmm::device_buffer> d_v_offsets;
+  std::unique_ptr<rmm::device_buffer> d_w_sizes;
+  std::unique_ptr<rmm::device_buffer> d_w_offsets;
+};
+
+struct graph_generator_t {
+  std::unique_ptr<rmm::device_buffer> d_source;
+  std::unique_ptr<rmm::device_buffer> d_destination;
+};
+
+// enum class generator_distribution_t { POWER_LAW = 0, UNIFORM };
+
+// aggregate for random_walks() COO return type
+// to be exposed to cython:
+//
+struct random_walk_coo_t {
+  size_t num_edges_;    // total number of COO triplets (for all paths)
+  size_t num_offsets_;  // offsets of where each COO set starts for each path;
+                        // NOTE: this can differ from num_paths_,
+                        // because paths with 0 edges (one vertex)
+                        // don't participate in the COO
+
+  std::unique_ptr<rmm::device_buffer>
+    d_src_;  // coalesced set of COO source vertices; |d_src_| = num_edges_
+  std::unique_ptr<rmm::device_buffer>
+    d_dst_;  // coalesced set of COO destination vertices; |d_dst_| = num_edges_
+  std::unique_ptr<rmm::device_buffer>
+    d_weights_;  // coalesced set of COO edge weights; |d_weights_| = num_edges_
+  std::unique_ptr<rmm::device_buffer>
+    d_offsets_;  // offsets where each COO subset for each path starts; |d_offsets_| = num_offsets_
+};
+
+// wrapper for renumber_edgelist() return
+// (unrenumbering maps, etc.)
+//
+template <typename vertex_t, typename edge_t>
+struct renum_tuple_t {
+  explicit renum_tuple_t(raft::handle_t const& handle) : dv_(0, handle.get_stream()), part_() {}
+
+  rmm::device_uvector<vertex_t>& get_dv(void) { return dv_; }
+
+  std::pair<std::unique_ptr<rmm::device_buffer>, size_t> get_dv_wrap(
+    void)  // const: see above explanation
+  {
+    return std::make_pair(std::make_unique<rmm::device_buffer>(dv_.release()), sizeof(vertex_t));
+  }
+
+  cugraph::experimental::partition_t<vertex_t>& get_partition(void) { return part_; }
+  vertex_t& get_num_vertices(void) { return nv_; }
+  edge_t& get_num_edges(void) { return ne_; }
+
+  std::vector<vertex_t>& get_segment_offsets(void) { return segment_offsets_; }
+
+  std::unique_ptr<std::vector<vertex_t>> get_segment_offsets_wrap()
+  {  // const
+    return std::make_unique<std::vector<vertex_t>>(segment_offsets_);
+  }
+
+  // `partition_t` pass-through getters
+  //
+  int get_part_row_size() const { return part_.get_row_size(); }
+
+  int get_part_col_size() const { return part_.get_col_size(); }
+
+  int get_part_comm_rank() const { return part_.get_comm_rank(); }
+
+  // FIXME: part_.get_vertex_partition_offsets() returns a std::vector
+  //
+  std::unique_ptr<std::vector<vertex_t>> get_partition_offsets_wrap(void)  // const
+  {
+    return std::make_unique<std::vector<vertex_t>>(part_.get_vertex_partition_offsets());
+  }
+
+  std::pair<vertex_t, vertex_t> get_part_local_vertex_range() const
+  {
+    auto tpl_v = part_.get_local_vertex_range();
+    return std::make_pair(std::get<0>(tpl_v), std::get<1>(tpl_v));
+  }
+
+  vertex_t get_part_local_vertex_first() const { return part_.get_local_vertex_first(); }
+
+  vertex_t get_part_local_vertex_last() const { return part_.get_local_vertex_last(); }
+
+  std::pair<vertex_t, vertex_t> get_part_vertex_partition_range(size_t vertex_partition_idx) const
+  {
+    auto tpl_v = part_.get_vertex_partition_range(vertex_partition_idx);
+    return std::make_pair(std::get<0>(tpl_v), std::get<1>(tpl_v));
+  }
+
+  vertex_t get_part_vertex_partition_first(size_t vertex_partition_idx) const
+  {
+    return part_.get_vertex_partition_first(vertex_partition_idx);
+  }
+
+  vertex_t get_part_vertex_partition_last(size_t vertex_partition_idx) const
+  {
+    return part_.get_vertex_partition_last(vertex_partition_idx);
+  }
+
+  vertex_t get_part_vertex_partition_size(size_t vertex_partition_idx) const
+  {
+    return part_.get_vertex_partition_size(vertex_partition_idx);
+  }
+
+  size_t get_part_number_of_matrix_partitions() const
+  {
+    return part_.get_number_of_matrix_partitions();
+  }
+
+  std::pair<vertex_t, vertex_t> get_part_matrix_partition_major_range(size_t partition_idx) const
+  {
+    auto tpl_v = part_.get_matrix_partition_major_range(partition_idx);
+    return std::make_pair(std::get<0>(tpl_v), std::get<1>(tpl_v));
+  }
+
+  vertex_t get_part_matrix_partition_major_first(size_t partition_idx) const
+  {
+    return part_.get_matrix_partition_major_first(partition_idx);
+  }
+
+  vertex_t get_part_matrix_partition_major_last(size_t partition_idx) const
+  {
+    return part_.get_matrix_partition_major_last(partition_idx);
+  }
+
+  vertex_t get_part_matrix_partition_major_value_start_offset(size_t partition_idx) const
+  {
+    return part_.get_matrix_partition_major_value_start_offset(partition_idx);
+  }
+
+  std::pair<vertex_t, vertex_t> get_part_matrix_partition_minor_range() const
+  {
+    auto tpl_v = part_.get_matrix_partition_minor_range();
+    return std::make_pair(std::get<0>(tpl_v), std::get<1>(tpl_v));
+  }
+
+  vertex_t get_part_matrix_partition_minor_first() const
+  {
+    return part_.get_matrix_partition_minor_first();
+  }
+
+  vertex_t get_part_matrix_partition_minor_last() const
+  {
+    return part_.get_matrix_partition_minor_last();
+  }
+
+ private:
+  rmm::device_uvector<vertex_t> dv_;
+  cugraph::experimental::partition_t<vertex_t> part_;
+  vertex_t nv_{0};
+  edge_t ne_{0};
+  std::vector<vertex_t> segment_offsets_;
+};
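+// a sketch of reading renumbering results, assuming `rt` is a populated
+// renum_tuple_t<int32_t, int32_t> (hypothetical; typically obtained from
+// call_renumber(), declared further below):
+//
+//   auto n = rt.get_num_vertices();
+//   auto [v_first, v_last] = rt.get_part_local_vertex_range();
+//   auto offsets = rt.get_partition_offsets_wrap();  // std::unique_ptr<std::vector<int32_t>>
+//   // the *_wrap() getters copy or release data into plain-vector/device_buffer
+//   // shapes that Cython can hold without template knowledge.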
+
+// FIXME: finish description for vertex_partition_offsets
+//
+// Factory function for populating an empty graph container with a new graph
+// object from basic types, and sets the corresponding meta-data. Args are:
+//
+// graph_container_t& graph_container
+//   Reference to the graph_container_t instance to
+//   populate. populate_graph_container() can only be called on an "empty"
+//   container (ie. a container that has not been previously populated by
+//   populate_graph_container())
+//
+// graphTypeEnum legacyType
+//   Specifies the type of graph when instantiating a legacy graph type
+//   (GraphCSRViewFloat, etc.).
+//   NOTE: this parameter will be removed when the transition to exclusive use
+//   of the new 2D graph classes is complete.
+//
+// raft::handle_t const& handle
+//   Raft handle to be set on the new graph instance in the container
+//
+// void* src_vertices, dst_vertices, weights
+//   Pointer to an array of values representing source and destination vertices,
+//   and edge weights respectively. The value types of the array are specified
+//   using numberTypeEnum values separately (see below). offsets should be size
+//   num_vertices+1, indices should be size num_edges, weights should also be
+//   size num_edges
+//
+// void* vertex_partition_offsets
+//   Pointer to an array of vertexType values representing offsets into the
+//   individual partitions for a multi-GPU partitioned graph. The offsets are used for ...
+//
+// numberTypeEnum vertexType, edgeType, weightType
+//   numberTypeEnum enum value describing the data type for the vertices,
+//   offsets, and weights arrays respectively. These enum values are used to
+//   instantiate the proper templated graph type and for casting the arrays
+//   accordingly.
+//
+// int num_vertices, num_edges
+//   The number of vertices and edges respectively in the graph represented by
+//   the above arrays.
+//
+// bool is_weighted
+//   true if the resulting graph object should store edge weights
+//
+// bool transposed
+//   true if the resulting graph object should store a transposed adjacency
+//   matrix
+//
+// bool multi_gpu
+//   true if the resulting graph object is to be used for a multi-gpu
+//   application
+void populate_graph_container(graph_container_t& graph_container,
+                              raft::handle_t& handle,
+                              void* src_vertices,
+                              void* dst_vertices,
+                              void* weights,
+                              void* vertex_partition_offsets,
+                              void* segment_offsets,
+                              size_t num_segments,
+                              numberTypeEnum vertexType,
+                              numberTypeEnum edgeType,
+                              numberTypeEnum weightType,
+                              size_t num_local_edges,
+                              size_t num_global_vertices,
+                              size_t num_global_edges,
+                              bool is_weighted,
+                              bool is_symmetric,
+                              bool transposed,
+                              bool multi_gpu);
+
+// FIXME: comment this function
+// FIXME: Should local_* values be void* as well?
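+// a call sketch for populate_graph_container() above, single-GPU weighted COO
+// input; every pointer/size name here is hypothetical:
+//
+//   populate_graph_container(gc, handle,
+//                            d_srcs, d_dsts, d_weights,
+//                            nullptr /* vertex_partition_offsets */,
+//                            nullptr /* segment_offsets */, 0 /* num_segments */,
+//                            numberTypeEnum::int32Type, numberTypeEnum::int32Type,
+//                            numberTypeEnum::floatType,
+//                            num_edges /* num_local_edges */,
+//                            num_vertices, num_edges,
+//                            true /* is_weighted */, false /* is_symmetric */,
+//                            false /* transposed */, false /* multi_gpu */);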
+void populate_graph_container_legacy(graph_container_t& graph_container, + graphTypeEnum legacyType, + raft::handle_t const& handle, + void* offsets, + void* indices, + void* weights, + numberTypeEnum offsetType, + numberTypeEnum indexType, + numberTypeEnum weightType, + size_t num_global_vertices, + size_t num_global_edges, + int* local_vertices, + int* local_edges, + int* local_offsets); + +// Wrapper for calling Louvain using a graph container +template +std::pair call_louvain(raft::handle_t const& handle, + graph_container_t const& graph_container, + void* identifiers, + void* parts, + size_t max_level, + weight_t resolution); + +// Wrapper for calling Pagerank using a graph container +template +void call_pagerank(raft::handle_t const& handle, + graph_container_t const& graph_container, + vertex_t* identifiers, + weight_t* pagerank, + vertex_t personalization_subset_size, + vertex_t* personalization_subset, + weight_t* personalization_values, + double alpha, + double tolerance, + int64_t max_iter, + bool has_guess); + +// Wrapper for calling Katz centrality using a graph container +template +void call_katz_centrality(raft::handle_t const& handle, + graph_container_t const& graph_container, + vertex_t* identifiers, + weight_t* katz_centrality, + double alpha, + double beta, + double tolerance, + int64_t max_iter, + bool normalized, + bool has_guess); + +// Wrapper for calling BFS through a graph container +template +void call_bfs(raft::handle_t const& handle, + graph_container_t const& graph_container, + vertex_t* identifiers, + vertex_t* distances, + vertex_t* predecessors, + vertex_t depth_limit, + const vertex_t start_vertex, + bool direction_optimizing); + +// Wrapper for calling SSSP through a graph container +template +void call_sssp(raft::handle_t const& handle, + graph_container_t const& graph_container, + vertex_t* identifiers, + weight_t* distances, + vertex_t* predecessors, + const vertex_t source_vertex); + +// Wrapper for calling egonet through a graph container +template +std::unique_ptr call_egonet(raft::handle_t const& handle, + graph_container_t const& graph_container, + vertex_t* source_vertex, + vertex_t n_subgraphs, + vertex_t radius); + +// Wrapper for calling WCC through a graph container +template +void call_wcc(raft::handle_t const& handle, + graph_container_t const& graph_container, + vertex_t* components); + +// Wrapper for calling graph generator +template +std::unique_ptr call_generate_rmat_edgelist(raft::handle_t const& handle, + size_t scale, + size_t num_edges, + double a, + double b, + double c, + uint64_t seed, + bool clip_and_flip, + bool scramble_vertex_ids); +template +std::vector, std::unique_ptr>> +call_generate_rmat_edgelists(raft::handle_t const& handle, + size_t n_edgelists, + size_t min_scale, + size_t max_scale, + size_t edge_factor, + cugraph::generator_distribution_t size_distribution, + cugraph::generator_distribution_t edge_distribution, + uint64_t seed, + bool clip_and_flip, + bool scramble_vertex_ids); + +// wrapper for random_walks. 
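+// a sketch, assuming a populated graph_container_t `gc` and a device array
+// `d_start` of int32_t start vertices (both hypothetical):
+//
+//   auto rw = call_random_walks<int32_t, int32_t>(handle, gc,
+//                                                 d_start, num_paths,
+//                                                 max_depth, true /* use_padding */);
+//   // rw->d_coalesced_v_ then holds the coalesced (padded) vertex paths and
+//   // rw->d_sizes_ the per-path sizes.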
+//
+template <typename vertex_t, typename edge_t>
+std::enable_if_t<cugraph::experimental::is_vertex_edge_combo<vertex_t, edge_t>::value,
+                 std::unique_ptr<random_walk_ret_t>>
+call_random_walks(raft::handle_t const& handle,
+                  graph_container_t const& graph_container,
+                  vertex_t const* ptr_start_set,
+                  edge_t num_paths,
+                  edge_t max_depth,
+                  bool use_padding);
+
+template <typename index_t>
+std::unique_ptr<random_walk_path_t> call_rw_paths(raft::handle_t const& handle,
+                                                  index_t num_paths,
+                                                  index_t const* vertex_path_sizes);
+
+// converter from random_walks return type to COO:
+//
+template <typename vertex_t, typename index_t>
+std::unique_ptr<random_walk_coo_t> random_walks_to_coo(raft::handle_t const& handle,
+                                                       random_walk_ret_t& rw_ret);
+
+// wrapper for shuffling:
+//
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> call_shuffle(
+  raft::handle_t const& handle,
+  vertex_t*
+    edgelist_major_vertices,  // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place
+  vertex_t* edgelist_minor_vertices,  // [IN / OUT]
+  weight_t* edgelist_weights,         // [IN / OUT]
+  edge_t num_edgelist_edges);
+
+// Wrapper for calling renumber_edgelist() in place:
+//
+template <typename vertex_t, typename edge_t>
+std::unique_ptr<renum_tuple_t<vertex_t, edge_t>> call_renumber(
+  raft::handle_t const& handle,
+  vertex_t* shuffled_edgelist_major_vertices /* [INOUT] */,
+  vertex_t* shuffled_edgelist_minor_vertices /* [INOUT] */,
+  std::vector<edge_t> const& edge_counts,
+  bool do_expensive_check,
+  bool multi_gpu);
+
+// Helper for setting up subcommunicators, typically called as part of the
+// user-initiated comms initialization in Python.
+//
+// raft::handle_t& handle
+//   Raft handle for which the new subcommunicators will be created. The
+//   subcommunicators will then be accessible from the handle passed to the
+//   parallel processes.
+//
+// size_t row_comm_size
+//   Number of items in a partition row (ie. pcols), needed for creating the
+//   appropriate number of subcommunicator instances.
+void init_subcomms(raft::handle_t& handle, size_t row_comm_size);
+
+}  // namespace cython
+}  // namespace cugraph
diff --git a/cpp/include/cugraph/utilities/dataframe_buffer.cuh b/cpp/include/cugraph/utilities/dataframe_buffer.cuh
new file mode 100644
index 00000000000..d730a3afcff
--- /dev/null
+++ b/cpp/include/cugraph/utilities/dataframe_buffer.cuh
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#pragma once + +#include + +#include +#include +#include + +#include +#include + +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +template +auto allocate_dataframe_buffer_tuple_element_impl(size_t buffer_size, + rmm::cuda_stream_view stream_view) +{ + using element_t = typename thrust::tuple_element::type; + return rmm::device_uvector(buffer_size, stream_view); +} + +template +auto allocate_dataframe_buffer_tuple_impl(std::index_sequence, + size_t buffer_size, + rmm::cuda_stream_view stream_view) +{ + return std::make_tuple( + allocate_dataframe_buffer_tuple_element_impl(buffer_size, stream_view)...); +} + +template +struct resize_dataframe_buffer_tuple_iterator_element_impl { + void run(BufferType& buffer, size_t new_buffer_size, rmm::cuda_stream_view stream_view) + { + std::get(buffer).resize(new_buffer_size, stream_view); + resize_dataframe_buffer_tuple_iterator_element_impl().run( + buffer, new_buffer_size, stream_view); + } +}; + +template +struct resize_dataframe_buffer_tuple_iterator_element_impl { + void run(BufferType& buffer, size_t new_buffer_size, rmm::cuda_stream_view stream_view) {} +}; + +template +struct shrink_to_fit_dataframe_buffer_tuple_iterator_element_impl { + void run(BufferType& buffer, rmm::cuda_stream_view stream_view) + { + std::get(buffer).shrink_to_fit(stream_view); + shrink_to_fit_dataframe_buffer_tuple_iterator_element_impl() + .run(buffer, stream_view); + } +}; + +template +struct shrink_to_fit_dataframe_buffer_tuple_iterator_element_impl { + void run(BufferType& buffer, rmm::cuda_stream_view stream_view) {} +}; + +template +auto get_dataframe_buffer_begin_tuple_element_impl(BufferType& buffer) +{ + using element_t = typename thrust::tuple_element::type; + return std::get(buffer).begin(); +} + +template +auto get_dataframe_buffer_begin_tuple_impl(std::index_sequence, BufferType& buffer) +{ + // thrust::make_tuple instead of std::make_tuple as this is fed to thrust::make_zip_iterator. + return thrust::make_tuple( + get_dataframe_buffer_begin_tuple_element_impl(buffer)...); +} + +template +auto get_dataframe_buffer_end_tuple_element_impl(BufferType& buffer) +{ + using element_t = typename thrust::tuple_element::type; + return std::get(buffer).end(); +} + +template +auto get_dataframe_buffer_end_tuple_impl(std::index_sequence, BufferType& buffer) +{ + // thrust::make_tuple instead of std::make_tuple as this is fed to thrust::make_zip_iterator. 
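+  // a usage sketch of the public API built on these helpers (the `n`, `handle`,
+  // and `stream_view` names are hypothetical):
+  //
+  //   auto buf = allocate_dataframe_buffer<thrust::tuple<int32_t, float>>(
+  //     n, handle.get_stream_view());  // a std::tuple of two device_uvectors
+  //   auto first = get_dataframe_buffer_begin<thrust::tuple<int32_t, float>>(buf);
+  //   thrust::fill(rmm::exec_policy(stream_view), first, first + n,
+  //                thrust::make_tuple(int32_t{0}, 0.0f));  // zip over (int, float) rows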
+ return thrust::make_tuple(get_dataframe_buffer_end_tuple_element_impl(buffer)...); +} + +} // namespace detail + +template ::value>* = nullptr> +auto allocate_dataframe_buffer(size_t buffer_size, rmm::cuda_stream_view stream_view) +{ + return rmm::device_uvector(buffer_size, stream_view); +} + +template ::value>* = nullptr> +auto allocate_dataframe_buffer(size_t buffer_size, rmm::cuda_stream_view stream_view) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + return detail::allocate_dataframe_buffer_tuple_impl( + std::make_index_sequence(), buffer_size, stream_view); +} + +template ::value>* = nullptr> +void resize_dataframe_buffer(BufferType& buffer, + size_t new_buffer_size, + rmm::cuda_stream_view stream_view) +{ + buffer.resize(new_buffer_size, stream_view); +} + +template ::value>* = nullptr> +void resize_dataframe_buffer(BufferType& buffer, + size_t new_buffer_size, + rmm::cuda_stream_view stream_view) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + detail:: + resize_dataframe_buffer_tuple_iterator_element_impl() + .run(buffer, new_buffer_size, stream_view); +} + +template ::value>* = nullptr> +void shrink_to_fit_dataframe_buffer(BufferType& buffer, rmm::cuda_stream_view stream_view) +{ + buffer.shrink_to_fit(stream_view); +} + +template ::value>* = nullptr> +void shrink_to_fit_dataframe_buffer(BufferType& buffer, rmm::cuda_stream_view stream_view) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + detail::shrink_to_fit_dataframe_buffer_tuple_iterator_element_impl() + .run(buffer, stream_view); +} + +template ::value>* = nullptr> +size_t size_dataframe_buffer(BufferType& buffer) +{ + return buffer.size(); +} + +template ::value>* = nullptr> +size_t size_dataframe_buffer(BufferType& buffer) +{ + return std::get<0>(buffer).size(); +} + +template ::value>* = nullptr> +auto get_dataframe_buffer_begin(BufferType& buffer) +{ + return buffer.begin(); +} + +template ::value>* = nullptr> +auto get_dataframe_buffer_begin(BufferType& buffer) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + return thrust::make_zip_iterator(detail::get_dataframe_buffer_begin_tuple_impl( + std::make_index_sequence(), buffer)); +} + +template ::value>* = nullptr> +auto get_dataframe_buffer_end(BufferType& buffer) +{ + return buffer.end(); +} + +template ::value>* = nullptr> +auto get_dataframe_buffer_end(BufferType& buffer) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + return thrust::make_zip_iterator( + detail::get_dataframe_buffer_end_tuple_impl(std::make_index_sequence(), buffer)); +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/utilities/comm_utils.cuh b/cpp/include/cugraph/utilities/device_comm.cuh similarity index 68% rename from cpp/include/utilities/comm_utils.cuh rename to cpp/include/cugraph/utilities/device_comm.cuh index fb69fff49c9..59a01957272 100644 --- a/cpp/include/utilities/comm_utils.cuh +++ b/cpp/include/cugraph/utilities/device_comm.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,16 +15,16 @@ */ #pragma once -#include +#include #include +#include #include #include #include #include -#include #include namespace cugraph { @@ -32,66 +32,6 @@ namespace experimental { namespace detail { -template -struct update_vector_of_tuple_scalar_elements_from_tuple_impl { - void update(std::vector& tuple_scalar_elements, TupleType const& tuple) const - { - using element_t = typename thrust::tuple_element::type; - static_assert(sizeof(element_t) <= sizeof(int64_t)); - auto ptr = reinterpret_cast(tuple_scalar_elements.data() + I); - *ptr = thrust::get(tuple); - update_vector_of_tuple_scalar_elements_from_tuple_impl().update( - tuple_scalar_elements, tuple); - } -}; - -template -struct update_vector_of_tuple_scalar_elements_from_tuple_impl { - void update(std::vector& tuple_scalar_elements, TupleType const& tuple) const { return; } -}; - -template -struct update_tuple_from_vector_of_tuple_scalar_elements_impl { - void update(TupleType& tuple, std::vector const& tuple_scalar_elements) const - { - using element_t = typename thrust::tuple_element::type; - static_assert(sizeof(element_t) <= sizeof(int64_t)); - auto ptr = reinterpret_cast(tuple_scalar_elements.data() + I); - thrust::get(tuple) = *ptr; - update_tuple_from_vector_of_tuple_scalar_elements_impl().update( - tuple, tuple_scalar_elements); - } -}; - -template -struct update_tuple_from_vector_of_tuple_scalar_elements_impl { - void update(TupleType& tuple, std::vector const& tuple_scalar_elements) const { return; } -}; - -template -struct host_allreduce_tuple_scalar_element_impl { - void run(raft::comms::comms_t const& comm, - rmm::device_uvector& tuple_scalar_elements, - cudaStream_t stream) const - { - using element_t = typename thrust::tuple_element::type; - static_assert(sizeof(element_t) <= sizeof(int64_t)); - auto ptr = reinterpret_cast(tuple_scalar_elements.data() + I); - comm.allreduce(ptr, ptr, 1, raft::comms::op_t::SUM, stream); - host_allreduce_tuple_scalar_element_impl().run( - comm, tuple_scalar_elements, stream); - } -}; - -template -struct host_allreduce_tuple_scalar_element_impl { - void run(raft::comms::comms_t const& comm, - rmm::device_uvector& tuple_scalar_elements, - cudaStream_t stream) const - { - } -}; - template T* iter_to_raw_ptr(T* ptr) { @@ -236,7 +176,7 @@ device_sendrecv_impl(raft::comms::comms_t const& comm, OutputIterator output_first, size_t rx_count, int src, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { // no-op } @@ -252,26 +192,18 @@ device_sendrecv_impl(raft::comms::comms_t const& comm, OutputIterator output_first, size_t rx_count, int src, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { using value_type = typename std::iterator_traits::value_type; static_assert( std::is_same::value_type, value_type>::value); - // ncclSend/ncclRecv pair needs to be located inside ncclGroupStart/ncclGroupEnd to avoid deadlock - ncclGroupStart(); - ncclSend(iter_to_raw_ptr(input_first), - tx_count * sizeof(value_type), - ncclUint8, - dst, - comm.get_nccl_comm(), - stream); - ncclRecv(iter_to_raw_ptr(output_first), - rx_count * sizeof(value_type), - ncclUint8, - src, - comm.get_nccl_comm(), - stream); - ncclGroupEnd(); + comm.device_sendrecv(iter_to_raw_ptr(input_first), + tx_count, + dst, + iter_to_raw_ptr(output_first), + rx_count, + src, + stream_view.value()); } template @@ -283,7 +215,7 @@ struct device_sendrecv_tuple_iterator_element_impl { OutputIterator output_first, size_t rx_count, int src, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { 
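+    // The I/tuple_size recursion peels off one tuple element per instantiation;
+    // an illustrative expansion (not additional API), for zipped (int*, float*)
+    // column iterators:
+    //   device_sendrecv_impl(comm, int_col, ..., stream_view);    // I == 0
+    //   device_sendrecv_impl(comm, float_col, ..., stream_view);  // I == 1
+    //   // I == tuple_size: empty terminating specialization below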
using output_value_t = typename thrust:: tuple_element::value_type>::type; @@ -297,9 +229,9 @@ struct device_sendrecv_tuple_iterator_element_impl { tuple_element_output_first, rx_count, src, - stream); + stream_view.value()); device_sendrecv_tuple_iterator_element_impl().run( - comm, input_first, tx_count, dst, output_first, rx_count, src, stream); + comm, input_first, tx_count, dst, output_first, rx_count, src, stream_view); } }; @@ -307,10 +239,12 @@ template struct device_sendrecv_tuple_iterator_element_impl { void run(raft::comms::comms_t const& comm, InputIterator input_first, - size_t count, + size_t tx_count, int dst, - int base_tag, - raft::comms::request_t* requests) const + OutputIterator output_first, + size_t rx_count, + int src, + rmm::cuda_stream_view stream_view) const { } }; @@ -326,7 +260,7 @@ device_multicast_sendrecv_impl(raft::comms::comms_t const& comm, std::vector const& rx_counts, std::vector const& rx_offsets, std::vector const& rx_src_ranks, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { // no-op } @@ -344,30 +278,20 @@ device_multicast_sendrecv_impl(raft::comms::comms_t const& comm, std::vector const& rx_counts, std::vector const& rx_offsets, std::vector const& rx_src_ranks, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { using value_type = typename std::iterator_traits::value_type; static_assert( std::is_same::value_type, value_type>::value); - // ncclSend/ncclRecv pair needs to be located inside ncclGroupStart/ncclGroupEnd to avoid deadlock - ncclGroupStart(); - for (size_t i = 0; i < tx_counts.size(); ++i) { - ncclSend(iter_to_raw_ptr(input_first + tx_offsets[i]), - tx_counts[i] * sizeof(value_type), - ncclUint8, - tx_dst_ranks[i], - comm.get_nccl_comm(), - stream); - } - for (size_t i = 0; i < rx_counts.size(); ++i) { - ncclRecv(iter_to_raw_ptr(output_first + rx_offsets[i]), - rx_counts[i] * sizeof(value_type), - ncclUint8, - rx_src_ranks[i], - comm.get_nccl_comm(), - stream); - } - ncclGroupEnd(); + comm.device_multicast_sendrecv(iter_to_raw_ptr(input_first), + tx_counts, + tx_offsets, + tx_dst_ranks, + iter_to_raw_ptr(output_first), + rx_counts, + rx_offsets, + rx_src_ranks, + stream_view.value()); } template @@ -381,7 +305,7 @@ struct device_multicast_sendrecv_tuple_iterator_element_impl { std::vector const& rx_counts, std::vector const& rx_offsets, std::vector const& rx_src_ranks, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { using output_value_t = typename thrust:: tuple_element::value_type>::type; @@ -397,7 +321,7 @@ struct device_multicast_sendrecv_tuple_iterator_element_impl { rx_counts, rx_offsets, rx_src_ranks, - stream); + stream_view); device_multicast_sendrecv_tuple_iterator_element_impl() .run(comm, input_first, @@ -408,7 +332,7 @@ struct device_multicast_sendrecv_tuple_iterator_element_impl { rx_counts, rx_offsets, rx_src_ranks, - stream); + stream_view); } }; @@ -423,7 +347,7 @@ struct device_multicast_sendrecv_tuple_iterator_element_impl const& rx_counts, std::vector const& rx_offsets, std::vector const& rx_src_ranks, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { } }; @@ -435,7 +359,7 @@ device_bcast_impl(raft::comms::comms_t const& comm, OutputIterator output_first, size_t count, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { // no-op } @@ -449,14 +373,14 @@ device_bcast_impl(raft::comms::comms_t const& comm, OutputIterator output_first, size_t count, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { 
static_assert(std::is_same::value_type, typename std::iterator_traits::value_type>::value); if (comm.get_rank() == root) { - comm.bcast(iter_to_raw_ptr(input_first), count, root, stream); + comm.bcast(iter_to_raw_ptr(input_first), count, root, stream_view.value()); } else { - comm.bcast(iter_to_raw_ptr(output_first), count, root, stream); + comm.bcast(iter_to_raw_ptr(output_first), count, root, stream_view.value()); } } @@ -467,16 +391,16 @@ struct device_bcast_tuple_iterator_element_impl { OutputIterator output_first, size_t count, int root, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { device_bcast_impl(comm, thrust::get(input_first.get_iterator_tuple()), thrust::get(output_first.get_iterator_tuple()), count, root, - stream); - device_bcast_tuple_iterator_element_impl( - comm, input_first, output_first, count, root, stream); + stream_view); + device_bcast_tuple_iterator_element_impl().run( + comm, input_first, output_first, count, root, stream_view); } }; @@ -487,7 +411,68 @@ struct device_bcast_tuple_iterator_element_impl +std::enable_if_t::value, void> +device_allreduce_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + rmm::cuda_stream_view stream_view) +{ + // no-op +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allreduce_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + rmm::cuda_stream_view stream_view) +{ + static_assert(std::is_same::value_type, + typename std::iterator_traits::value_type>::value); + comm.allreduce( + iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, op, stream_view.value()); +} + +template +struct device_allreduce_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + rmm::cuda_stream_view stream_view) const + { + device_allreduce_impl(comm, + thrust::get(input_first.get_iterator_tuple()), + thrust::get(output_first.get_iterator_tuple()), + count, + op, + stream_view); + device_allreduce_tuple_iterator_element_impl().run( + comm, input_first, output_first, count, op, stream_view); + } +}; + +template +struct device_allreduce_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + rmm::cuda_stream_view stream_view) const { } }; @@ -500,7 +485,7 @@ device_reduce_impl(raft::comms::comms_t const& comm, size_t count, raft::comms::op_t op, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { // no-op } @@ -515,11 +500,16 @@ device_reduce_impl(raft::comms::comms_t const& comm, size_t count, raft::comms::op_t op, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { static_assert(std::is_same::value_type, typename std::iterator_traits::value_type>::value); - comm.reduce(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, op, root, stream); + comm.reduce(iter_to_raw_ptr(input_first), + iter_to_raw_ptr(output_first), + count, + op, + root, + stream_view.value()); } template @@ -530,7 +520,7 @@ struct device_reduce_tuple_iterator_element_impl { size_t count, raft::comms::op_t op, int root, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { device_reduce_impl(comm, 
thrust::get(input_first.get_iterator_tuple()), @@ -538,9 +528,9 @@ struct device_reduce_tuple_iterator_element_impl { count, op, root, - stream); - device_reduce_tuple_iterator_element_impl( - comm, input_first, output_first, count, op, root, stream); + stream_view); + device_reduce_tuple_iterator_element_impl().run( + comm, input_first, output_first, count, op, root, stream_view); } }; @@ -552,7 +542,7 @@ struct device_reduce_tuple_iterator_element_impl const& recvcounts, std::vector const& displacements, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { // no-op } @@ -578,7 +568,7 @@ device_allgatherv_impl(raft::comms::comms_t const& comm, OutputIterator output_first, std::vector const& recvcounts, std::vector const& displacements, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { static_assert(std::is_same::value_type, typename std::iterator_traits::value_type>::value); @@ -586,7 +576,7 @@ device_allgatherv_impl(raft::comms::comms_t const& comm, iter_to_raw_ptr(output_first), recvcounts.data(), displacements.data(), - stream); + stream_view.value()); } template @@ -596,16 +586,16 @@ struct device_allgatherv_tuple_iterator_element_impl { OutputIterator output_first, std::vector const& recvcounts, std::vector const& displacements, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { device_allgatherv_impl(comm, thrust::get(input_first.get_iterator_tuple()), thrust::get(output_first.get_iterator_tuple()), recvcounts, displacements, - stream); + stream_view); device_allgatherv_tuple_iterator_element_impl().run( - comm, input_first, output_first, recvcounts, displacements, stream); + comm, input_first, output_first, recvcounts, displacements, stream_view); } }; @@ -616,188 +606,88 @@ struct device_allgatherv_tuple_iterator_element_impl const& recvcounts, std::vector const& displacements, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { } }; -template -auto allocate_comm_buffer_tuple_element_impl(size_t buffer_size, cudaStream_t stream) -{ - using element_t = typename thrust::tuple_element::type; - return rmm::device_uvector(buffer_size, stream); -} - -template -auto allocate_comm_buffer_tuple_impl(std::index_sequence, - size_t buffer_size, - cudaStream_t stream) -{ - return thrust::make_tuple( - allocate_comm_buffer_tuple_element_impl(buffer_size, stream)...); -} - -template -auto get_comm_buffer_begin_tuple_element_impl(BufferType& buffer) -{ - using element_t = typename thrust::tuple_element::type; - return thrust::get(buffer).begin(); -} - -template -auto get_comm_buffer_begin_tuple_impl(std::index_sequence, BufferType& buffer) -{ - return thrust::make_tuple(get_comm_buffer_begin_tuple_element_impl(buffer)...); -} - -} // namespace detail - -template -std::enable_if_t::value, T> host_scalar_allreduce( - raft::comms::comms_t const& comm, T input, cudaStream_t stream) -{ - rmm::device_uvector d_input(1, stream); - raft::update_device(d_input.data(), &input, 1, stream); - comm.allreduce(d_input.data(), d_input.data(), 1, raft::comms::op_t::SUM, stream); - T h_input{}; - raft::update_host(&h_input, d_input.data(), 1, stream); - auto status = comm.sync_stream(stream); - CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); - return h_input; -} - -template -std::enable_if_t::value, T> -host_scalar_allreduce(raft::comms::comms_t const& comm, T input, cudaStream_t stream) +template +std::enable_if_t::value, void> +device_gatherv_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + 
OutputIterator output_first, + size_t sendcount, + std::vector const& recvcounts, + std::vector const& displacements, + int root, + rmm::cuda_stream_view stream_view) { - size_t constexpr tuple_size = thrust::tuple_size::value; - std::vector h_tuple_scalar_elements(tuple_size); - rmm::device_uvector d_tuple_scalar_elements(tuple_size, stream); - T ret{}; - - detail::update_vector_of_tuple_scalar_elements_from_tuple_impl().update( - h_tuple_scalar_elements, input); - raft::update_device( - d_tuple_scalar_elements.data(), h_tuple_scalar_elements.data(), tuple_size, stream); - detail::host_allreduce_tuple_scalar_element_impl().run( - comm, d_tuple_scalar_elements, stream); - raft::update_host( - h_tuple_scalar_elements.data(), d_tuple_scalar_elements.data(), tuple_size, stream); - auto status = comm.sync_stream(stream); - CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); - detail::update_tuple_from_vector_of_tuple_scalar_elements_impl().update( - ret, h_tuple_scalar_elements); - - return ret; + // no-op } -template -std::enable_if_t::value, T> host_scalar_bcast( - raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream) +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_gatherv_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + std::vector const& recvcounts, + std::vector const& displacements, + int root, + rmm::cuda_stream_view stream_view) { - rmm::device_uvector d_input(1, stream); - if (comm.get_rank() == root) { raft::update_device(d_input.data(), &input, 1, stream); } - comm.bcast(d_input.data(), 1, root, stream); - auto h_input = input; - if (comm.get_rank() != root) { raft::update_host(&h_input, d_input.data(), 1, stream); } - auto status = comm.sync_stream(stream); - CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); - return h_input; + static_assert(std::is_same::value_type, + typename std::iterator_traits::value_type>::value); + comm.gatherv(iter_to_raw_ptr(input_first), + iter_to_raw_ptr(output_first), + sendcount, + recvcounts.data(), + displacements.data(), + root, + stream_view.value()); } -template -std::enable_if_t::value, T> -host_scalar_bcast(raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream) -{ - size_t constexpr tuple_size = thrust::tuple_size::value; - std::vector h_tuple_scalar_elements(tuple_size); - rmm::device_uvector d_tuple_scalar_elements(tuple_size, stream); - auto ret = input; - - if (comm.get_rank() == root) { - detail::update_vector_of_tuple_scalar_elements_from_tuple_impl() - .update(h_tuple_scalar_elements, input); - raft::update_device( - d_tuple_scalar_elements.data(), h_tuple_scalar_elements.data(), tuple_size, stream); - } - comm.bcast(d_tuple_scalar_elements.data(), d_tuple_scalar_elements.size(), root, stream); - if (comm.get_rank() != root) { - raft::update_host( - h_tuple_scalar_elements.data(), d_tuple_scalar_elements.data(), tuple_size, stream); - } - auto status = comm.sync_stream(stream); - CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); - if (comm.get_rank() != root) { - detail::update_tuple_from_vector_of_tuple_scalar_elements_impl() - .update(ret, h_tuple_scalar_elements); +template +struct device_gatherv_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + std::vector const& recvcounts, + 
std::vector const& displacements, + int root, + rmm::cuda_stream_view stream_view) const + { + device_gatherv_impl(comm, + thrust::get(input_first.get_iterator_tuple()), + thrust::get(output_first.get_iterator_tuple()), + sendcount, + recvcounts, + displacements, + root, + stream_view); + device_gatherv_tuple_iterator_element_impl().run( + comm, input_first, output_first, sendcount, recvcounts, displacements, root, stream_view); } +}; - return ret; -} - -template -std::enable_if_t::value, std::vector> host_scalar_allgather( - raft::comms::comms_t const& comm, T input, cudaStream_t stream) -{ - std::vector rx_counts(comm.get_size(), size_t{1}); - std::vector displacements(rx_counts.size(), size_t{0}); - std::iota(displacements.begin(), displacements.end(), size_t{0}); - rmm::device_uvector d_outputs(rx_counts.size(), stream); - raft::update_device(d_outputs.data() + comm.get_rank(), &input, 1, stream); - comm.allgatherv(d_outputs.data() + comm.get_rank(), - d_outputs.data(), - rx_counts.data(), - displacements.data(), - stream); - std::vector h_outputs(rx_counts.size(), size_t{0}); - raft::update_host(h_outputs.data(), d_outputs.data(), rx_counts.size(), stream); - auto status = comm.sync_stream(stream); - CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); - return h_outputs; -} - -template -std::enable_if_t::value, std::vector> -host_scalar_allgather(raft::comms::comms_t const& comm, T input, cudaStream_t stream) -{ - size_t constexpr tuple_size = thrust::tuple_size::value; - std::vector rx_counts(comm.get_size(), tuple_size); - std::vector displacements(rx_counts.size(), size_t{0}); - for (size_t i = 0; i < displacements.size(); ++i) { displacements[i] = i * tuple_size; } - std::vector h_tuple_scalar_elements(tuple_size); - rmm::device_uvector d_allgathered_tuple_scalar_elements(comm.get_size() * tuple_size, - stream); - - detail::update_vector_of_tuple_scalar_elements_from_tuple_impl().update( - h_tuple_scalar_elements, input); - raft::update_device(d_allgathered_tuple_scalar_elements.data() + comm.get_rank() * tuple_size, - h_tuple_scalar_elements.data(), - tuple_size, - stream); - comm.allgatherv(d_allgathered_tuple_scalar_elements.data() + comm.get_rank() * tuple_size, - d_allgathered_tuple_scalar_elements.data(), - rx_counts.data(), - displacements.data(), - stream); - std::vector h_allgathered_tuple_scalar_elements(comm.get_size() * tuple_size); - raft::update_host(h_allgathered_tuple_scalar_elements.data(), - d_allgathered_tuple_scalar_elements.data(), - comm.get_size() * tuple_size, - stream); - auto status = comm.sync_stream(stream); - CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); - - std::vector ret(comm.get_size()); - for (size_t i = 0; i < ret.size(); ++i) { - std::vector h_tuple_scalar_elements( - h_allgathered_tuple_scalar_elements.data() + i * tuple_size, - h_allgathered_tuple_scalar_elements.data() + (i + 1) * tuple_size); - detail::update_tuple_from_vector_of_tuple_scalar_elements_impl() - .update(ret[i], h_tuple_scalar_elements); +template +struct device_gatherv_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + std::vector const& recvcounts, + std::vector const& displacements, + int root, + rmm::cuda_stream_view stream_view) const + { } +}; - return ret; -} +} // namespace detail template std::enable_if_t< @@ -889,10 +779,10 @@ device_sendrecv(raft::comms::comms_t const& comm, OutputIterator 
output_first, size_t rx_count, int src, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { detail::device_sendrecv_impl( - comm, input_first, tx_count, dst, output_first, rx_count, src, stream); + comm, input_first, tx_count, dst, output_first, rx_count, src, stream_view); } template @@ -907,7 +797,7 @@ device_sendrecv(raft::comms::comms_t const& comm, OutputIterator output_first, size_t rx_count, int src, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { static_assert( thrust::tuple_size::value_type>::value == @@ -923,7 +813,7 @@ device_sendrecv(raft::comms::comms_t const& comm, OutputIterator, size_t{0}, tuple_size>() - .run(comm, input_first, tx_count, dst, output_first, rx_count, src, stream); + .run(comm, input_first, tx_count, dst, output_first, rx_count, src, stream_view); } template @@ -939,7 +829,7 @@ device_multicast_sendrecv(raft::comms::comms_t const& comm, std::vector const& rx_counts, std::vector const& rx_offsets, std::vector const& rx_src_ranks, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { detail::device_multicast_sendrecv_impl(comm, input_first, @@ -950,7 +840,7 @@ device_multicast_sendrecv(raft::comms::comms_t const& comm, rx_counts, rx_offsets, rx_src_ranks, - stream); + stream_view); } template @@ -967,7 +857,7 @@ device_multicast_sendrecv(raft::comms::comms_t const& comm, std::vector const& rx_counts, std::vector const& rx_offsets, std::vector const& rx_src_ranks, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { static_assert( thrust::tuple_size::value_type>::value == @@ -992,7 +882,7 @@ device_multicast_sendrecv(raft::comms::comms_t const& comm, rx_counts, rx_offsets, rx_src_ranks, - stream); + stream_view); } template @@ -1004,9 +894,9 @@ device_bcast(raft::comms::comms_t const& comm, OutputIterator output_first, size_t count, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { - detail::device_bcast_impl(comm, input_first, output_first, count, root, stream); + detail::device_bcast_impl(comm, input_first, output_first, count, root, stream_view); } template @@ -1019,7 +909,7 @@ device_bcast(raft::comms::comms_t const& comm, OutputIterator output_first, size_t count, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { static_assert( thrust::tuple_size::value_type>::value == @@ -1029,8 +919,48 @@ device_bcast(raft::comms::comms_t const& comm, thrust::tuple_size::value_type>::value; detail:: - device_bcast_tuple_iterator_element_impl( - comm, input_first, output_first, count, root, stream); + device_bcast_tuple_iterator_element_impl() + .run(comm, input_first, output_first, count, root, stream_view); +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allreduce(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + rmm::cuda_stream_view stream_view) +{ + detail::device_allreduce_impl(comm, input_first, output_first, count, op, stream_view); +} + +template +std::enable_if_t< + is_thrust_tuple_of_arithmetic::value_type>::value && + is_thrust_tuple::value_type>::value, + void> +device_allreduce(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + rmm::cuda_stream_view stream_view) +{ + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value_type>::value); + + size_t constexpr tuple_size = + thrust::tuple_size::value_type>::value; + + 
detail::device_allreduce_tuple_iterator_element_impl() + .run(comm, input_first, output_first, count, op, stream_view); } template @@ -1043,9 +973,9 @@ device_reduce(raft::comms::comms_t const& comm, size_t count, raft::comms::op_t op, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { - detail::device_reduce_impl(comm, input_first, output_first, count, op, root, stream); + detail::device_reduce_impl(comm, input_first, output_first, count, op, root, stream_view); } template @@ -1059,7 +989,7 @@ device_reduce(raft::comms::comms_t const& comm, size_t count, raft::comms::op_t op, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { static_assert( thrust::tuple_size::value_type>::value == @@ -1068,9 +998,11 @@ device_reduce(raft::comms::comms_t const& comm, size_t constexpr tuple_size = thrust::tuple_size::value_type>::value; - detail:: - device_reduce_tuple_iterator_element_impl( - comm, input_first, output_first, count, op, root, stream); + detail::device_reduce_tuple_iterator_element_impl() + .run(comm, input_first, output_first, count, op, root, stream_view); } template @@ -1082,10 +1014,10 @@ device_allgatherv(raft::comms::comms_t const& comm, OutputIterator output_first, std::vector const& recvcounts, std::vector const& displacements, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { detail::device_allgatherv_impl( - comm, input_first, output_first, recvcounts, displacements, stream); + comm, input_first, output_first, recvcounts, displacements, stream_view); } template @@ -1098,7 +1030,7 @@ device_allgatherv(raft::comms::comms_t const& comm, OutputIterator output_first, std::vector const& recvcounts, std::vector const& displacements, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { static_assert( thrust::tuple_size::value_type>::value == @@ -1111,39 +1043,52 @@ device_allgatherv(raft::comms::comms_t const& comm, OutputIterator, size_t{0}, tuple_size>() - .run(comm, input_first, output_first, recvcounts, displacements, stream); + .run(comm, input_first, output_first, recvcounts, displacements, stream_view); } -template ::value>* = nullptr> -auto allocate_comm_buffer(size_t buffer_size, cudaStream_t stream) +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_gatherv(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + std::vector const& recvcounts, + std::vector const& displacements, + int root, + rmm::cuda_stream_view stream_view) { - return rmm::device_uvector(buffer_size, stream); + detail::device_gatherv_impl( + comm, input_first, output_first, sendcount, recvcounts, displacements, root, stream_view); } -template ::value>* = nullptr> -auto allocate_comm_buffer(size_t buffer_size, cudaStream_t stream) +template +std::enable_if_t< + is_thrust_tuple_of_arithmetic::value_type>::value && + is_thrust_tuple::value_type>::value, + void> +device_gatherv(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + std::vector const& recvcounts, + std::vector const& displacements, + int root, + rmm::cuda_stream_view stream_view) { - size_t constexpr tuple_size = thrust::tuple_size::value; - return detail::allocate_comm_buffer_tuple_impl( - std::make_index_sequence(), buffer_size, stream); -} + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value_type>::value); -template ::value>* = nullptr> -auto get_comm_buffer_begin(BufferType& buffer) -{ - 
return buffer.begin();
-}
-
-template <typename T,
-          typename BufferType,
-          typename std::enable_if_t<is_thrust_tuple_of_arithmetic<T>::value>* = nullptr>
-auto get_comm_buffer_begin(BufferType& buffer)
-{
-  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
-  return thrust::make_zip_iterator(
-    detail::get_comm_buffer_begin_tuple_impl<T>(std::make_index_sequence<tuple_size>(), buffer));
+  detail::device_gatherv_tuple_iterator_element_impl<InputIterator,
+                                                     OutputIterator,
+                                                     size_t{0},
+                                                     tuple_size>()
+    .run(comm, input_first, output_first, sendcount, recvcounts, displacements, root, stream_view);
 }
 
 }  // namespace experimental
diff --git a/cpp/include/utilities/error.hpp b/cpp/include/cugraph/utilities/error.hpp
similarity index 98%
rename from cpp/include/utilities/error.hpp
rename to cpp/include/cugraph/utilities/error.hpp
index e44e2c910ea..8cfb077cf7b 100644
--- a/cpp/include/utilities/error.hpp
+++ b/cpp/include/cugraph/utilities/error.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/cpp/include/cugraph/utilities/graph_traits.hpp b/cpp/include/cugraph/utilities/graph_traits.hpp
new file mode 100644
index 00000000000..363a13190be
--- /dev/null
+++ b/cpp/include/cugraph/utilities/graph_traits.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <type_traits>
+
+namespace cugraph {
+namespace experimental {
+
+// primary template:
+//
+template <typename T, typename... Args>
+struct is_one_of;  // purposely empty
+
+// partial specializations:
+//
+template <typename T, typename First, typename... Args>
+struct is_one_of<T, First, Args...> {
+  static constexpr bool value = std::is_same<T, First>::value || is_one_of<T, Args...>::value;
+};
+
+template <typename T>
+struct is_one_of<T> {
+  static constexpr bool value = false;
+};
+
+// meta-function that constrains
+// vertex_t and edge_t template param candidates:
+//
+template <typename vertex_t, typename edge_t>
+struct is_vertex_edge_combo {
+  static constexpr bool value = is_one_of<vertex_t, int32_t, int64_t>::value &&
+                                is_one_of<edge_t, int32_t, int64_t>::value &&
+                                (sizeof(vertex_t) <= sizeof(edge_t));
+};
+
+// meta-function that constrains
+// all 3 template param candidates:
+//
+template <typename vertex_t, typename edge_t, typename weight_t>
+struct is_candidate {
+  static constexpr bool value =
+    is_vertex_edge_combo<vertex_t, edge_t>::value && is_one_of<weight_t, float, double>::value;
+};
+
+}  // namespace experimental
+}  // namespace cugraph
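The `is_candidate` meta-function above is what the visitor/dispatcher code later in this diff uses to gate template instantiations. A minimal caller-side sketch (the function name and its usage here are illustrative, not part of this diff):

// Compile-time guard on a (vertex_t, edge_t, weight_t) combination.
#include <cugraph/utilities/graph_traits.hpp>

template <typename vertex_t, typename edge_t, typename weight_t>
void some_algorithm()  // hypothetical caller
{
  static_assert(cugraph::experimental::is_candidate<vertex_t, edge_t, weight_t>::value,
                "unsupported (vertex_t, edge_t, weight_t) combination");
}

// some_algorithm<int32_t, int64_t, float>() compiles;
// some_algorithm<int64_t, int32_t, float>() fails: sizeof(vertex_t) > sizeof(edge_t).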
diff --git a/cpp/include/cugraph/utilities/host_barrier.hpp b/cpp/include/cugraph/utilities/host_barrier.hpp
new file mode 100644
index 00000000000..11803a7bde4
--- /dev/null
+++ b/cpp/include/cugraph/utilities/host_barrier.hpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/comms/comms.hpp>
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cugraph {
+namespace experimental {
+
+// FIXME: a temporary hack till UCC is integrated into RAFT (so we can use UCC barrier for DASK and
+// MPI barrier for MPI)
+void host_barrier(raft::comms::comms_t const& comm, rmm::cuda_stream_view stream_view);
+
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/include/cugraph/utilities/host_scalar_comm.cuh b/cpp/include/cugraph/utilities/host_scalar_comm.cuh
new file mode 100644
index 00000000000..26994ebde14
--- /dev/null
+++ b/cpp/include/cugraph/utilities/host_scalar_comm.cuh
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cugraph/utilities/error.hpp>
+#include <cugraph/utilities/thrust_tuple_utils.cuh>
+
+#include <raft/comms/comms.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/tuple.h>
+#include <numeric>
+
+namespace cugraph {
+namespace experimental {
+
+namespace detail {
+
+template <typename TupleType, size_t I, size_t N>
+struct update_vector_of_tuple_scalar_elements_from_tuple_impl {
+  void update(std::vector<int64_t>& tuple_scalar_elements, TupleType const& tuple) const
+  {
+    using element_t = typename thrust::tuple_element<I, TupleType>::type;
+    static_assert(sizeof(element_t) <= sizeof(int64_t));
+    auto ptr = reinterpret_cast<element_t*>(tuple_scalar_elements.data() + I);
+    *ptr     = thrust::get<I>(tuple);
+    update_vector_of_tuple_scalar_elements_from_tuple_impl<TupleType, I + 1, N>().update(
+      tuple_scalar_elements, tuple);
+  }
+};
+
+template <typename TupleType, size_t I>
+struct update_vector_of_tuple_scalar_elements_from_tuple_impl<TupleType, I, I> {
+  void update(std::vector<int64_t>& tuple_scalar_elements, TupleType const& tuple) const { return; }
+};
+
+template <typename TupleType, size_t I, size_t N>
+struct update_tuple_from_vector_of_tuple_scalar_elements_impl {
+  void update(TupleType& tuple, std::vector<int64_t> const& tuple_scalar_elements) const
+  {
+    using element_t = typename thrust::tuple_element<I, TupleType>::type;
+    static_assert(sizeof(element_t) <= sizeof(int64_t));
+    auto ptr = reinterpret_cast<element_t const*>(tuple_scalar_elements.data() + I);
+    thrust::get<I>(tuple) = *ptr;
+    update_tuple_from_vector_of_tuple_scalar_elements_impl<TupleType, I + 1, N>().update(
+      tuple, tuple_scalar_elements);
+  }
+};
+
+template <typename TupleType, size_t I>
+struct update_tuple_from_vector_of_tuple_scalar_elements_impl<TupleType, I, I> {
+  void update(TupleType& tuple, std::vector<int64_t> const& tuple_scalar_elements) const { return; }
+};
+
+template <typename TupleType, size_t I, size_t N>
+struct host_allreduce_tuple_scalar_element_impl {
+  void run(raft::comms::comms_t const& comm,
+           rmm::device_uvector<int64_t>& tuple_scalar_elements,
+           cudaStream_t stream) const
+  {
+    using element_t = typename thrust::tuple_element<I, TupleType>::type;
+    static_assert(sizeof(element_t) <= sizeof(int64_t));
+    auto ptr = reinterpret_cast<element_t*>(tuple_scalar_elements.data() + I);
+    comm.allreduce(ptr, ptr, 1, raft::comms::op_t::SUM, stream);
host_allreduce_tuple_scalar_element_impl().run( + comm, tuple_scalar_elements, stream); + } +}; + +template +struct host_allreduce_tuple_scalar_element_impl { + void run(raft::comms::comms_t const& comm, + rmm::device_uvector& tuple_scalar_elements, + cudaStream_t stream) const + { + } +}; + +template +struct host_reduce_tuple_scalar_element_impl { + void run(raft::comms::comms_t const& comm, + rmm::device_uvector& tuple_scalar_elements, + int root, + cudaStream_t stream) const + { + using element_t = typename thrust::tuple_element::type; + static_assert(sizeof(element_t) <= sizeof(int64_t)); + auto ptr = reinterpret_cast(tuple_scalar_elements.data() + I); + comm.reduce(ptr, ptr, 1, raft::comms::op_t::SUM, root, stream); + host_reduce_tuple_scalar_element_impl().run( + comm, tuple_scalar_elements, root, stream); + } +}; + +template +struct host_reduce_tuple_scalar_element_impl { + void run(raft::comms::comms_t const& comm, + rmm::device_uvector& tuple_scalar_elements, + int root, + cudaStream_t stream) const + { + } +}; + +} // namespace detail + +template +std::enable_if_t::value, T> host_scalar_allreduce( + raft::comms::comms_t const& comm, T input, cudaStream_t stream) +{ + rmm::device_uvector d_input(1, stream); + raft::update_device(d_input.data(), &input, 1, stream); + comm.allreduce(d_input.data(), d_input.data(), 1, raft::comms::op_t::SUM, stream); + T h_input{}; + raft::update_host(&h_input, d_input.data(), 1, stream); + auto status = comm.sync_stream(stream); + CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); + return h_input; +} + +template +std::enable_if_t::value, T> +host_scalar_allreduce(raft::comms::comms_t const& comm, T input, cudaStream_t stream) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + std::vector h_tuple_scalar_elements(tuple_size); + rmm::device_uvector d_tuple_scalar_elements(tuple_size, stream); + T ret{}; + + detail::update_vector_of_tuple_scalar_elements_from_tuple_impl().update( + h_tuple_scalar_elements, input); + raft::update_device( + d_tuple_scalar_elements.data(), h_tuple_scalar_elements.data(), tuple_size, stream); + detail::host_allreduce_tuple_scalar_element_impl().run( + comm, d_tuple_scalar_elements, stream); + raft::update_host( + h_tuple_scalar_elements.data(), d_tuple_scalar_elements.data(), tuple_size, stream); + auto status = comm.sync_stream(stream); + CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); + detail::update_tuple_from_vector_of_tuple_scalar_elements_impl().update( + ret, h_tuple_scalar_elements); + + return ret; +} + +// Return value is valid only in root (return value may better be std::optional in C++17 or later) +template +std::enable_if_t::value, T> host_scalar_reduce( + raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream) +{ + rmm::device_uvector d_input(1, stream); + raft::update_device(d_input.data(), &input, 1, stream); + comm.reduce(d_input.data(), d_input.data(), 1, raft::comms::op_t::SUM, stream); + T h_input{}; + if (comm.get_rank() == root) { raft::update_host(&h_input, d_input.data(), 1, stream); } + auto status = comm.sync_stream(stream); + CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); + return h_input; +} + +// Return value is valid only in root (return value may better be std::optional in C++17 or later) +template +std::enable_if_t::value, T> +host_scalar_reduce(raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream) +{ + size_t constexpr tuple_size = 
thrust::tuple_size::value; + std::vector h_tuple_scalar_elements(tuple_size); + rmm::device_uvector d_tuple_scalar_elements(tuple_size, stream); + T ret{}; + + detail::update_vector_of_tuple_scalar_elements_from_tuple_impl().update( + h_tuple_scalar_elements, input); + raft::update_device( + d_tuple_scalar_elements.data(), h_tuple_scalar_elements.data(), tuple_size, stream); + detail::host_reduce_tuple_scalar_element_impl().run( + comm, d_tuple_scalar_elements, root, stream); + if (comm.get_rank() == root) { + raft::update_host( + h_tuple_scalar_elements.data(), d_tuple_scalar_elements.data(), tuple_size, stream); + } + auto status = comm.sync_stream(stream); + CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); + if (comm.get_rank() == root) { + detail::update_tuple_from_vector_of_tuple_scalar_elements_impl() + .update(ret, h_tuple_scalar_elements); + } + + return ret; +} + +template +std::enable_if_t::value, T> host_scalar_bcast( + raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream) +{ + rmm::device_uvector d_input(1, stream); + if (comm.get_rank() == root) { raft::update_device(d_input.data(), &input, 1, stream); } + comm.bcast(d_input.data(), 1, root, stream); + auto h_input = input; + if (comm.get_rank() != root) { raft::update_host(&h_input, d_input.data(), 1, stream); } + auto status = comm.sync_stream(stream); + CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); + return h_input; +} + +template +std::enable_if_t::value, T> +host_scalar_bcast(raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + std::vector h_tuple_scalar_elements(tuple_size); + rmm::device_uvector d_tuple_scalar_elements(tuple_size, stream); + auto ret = input; + + if (comm.get_rank() == root) { + detail::update_vector_of_tuple_scalar_elements_from_tuple_impl() + .update(h_tuple_scalar_elements, input); + raft::update_device( + d_tuple_scalar_elements.data(), h_tuple_scalar_elements.data(), tuple_size, stream); + } + comm.bcast(d_tuple_scalar_elements.data(), d_tuple_scalar_elements.size(), root, stream); + if (comm.get_rank() != root) { + raft::update_host( + h_tuple_scalar_elements.data(), d_tuple_scalar_elements.data(), tuple_size, stream); + } + auto status = comm.sync_stream(stream); + CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); + if (comm.get_rank() != root) { + detail::update_tuple_from_vector_of_tuple_scalar_elements_impl() + .update(ret, h_tuple_scalar_elements); + } + + return ret; +} + +template +std::enable_if_t::value, std::vector> host_scalar_allgather( + raft::comms::comms_t const& comm, T input, cudaStream_t stream) +{ + std::vector rx_counts(comm.get_size(), size_t{1}); + std::vector displacements(rx_counts.size(), size_t{0}); + std::iota(displacements.begin(), displacements.end(), size_t{0}); + rmm::device_uvector d_outputs(rx_counts.size(), stream); + raft::update_device(d_outputs.data() + comm.get_rank(), &input, 1, stream); + // FIXME: better use allgather + comm.allgatherv(d_outputs.data() + comm.get_rank(), + d_outputs.data(), + rx_counts.data(), + displacements.data(), + stream); + std::vector h_outputs(rx_counts.size()); + raft::update_host(h_outputs.data(), d_outputs.data(), rx_counts.size(), stream); + auto status = comm.sync_stream(stream); + CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); + return h_outputs; +} + +template 
+std::enable_if_t::value, std::vector> +host_scalar_allgather(raft::comms::comms_t const& comm, T input, cudaStream_t stream) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + std::vector rx_counts(comm.get_size(), tuple_size); + std::vector displacements(rx_counts.size(), size_t{0}); + for (size_t i = 0; i < displacements.size(); ++i) { + displacements[i] = i * tuple_size; + } + std::vector h_tuple_scalar_elements(tuple_size); + rmm::device_uvector d_allgathered_tuple_scalar_elements(comm.get_size() * tuple_size, + stream); + + detail::update_vector_of_tuple_scalar_elements_from_tuple_impl().update( + h_tuple_scalar_elements, input); + raft::update_device(d_allgathered_tuple_scalar_elements.data() + comm.get_rank() * tuple_size, + h_tuple_scalar_elements.data(), + tuple_size, + stream); + // FIXME: better use allgather + comm.allgatherv(d_allgathered_tuple_scalar_elements.data() + comm.get_rank() * tuple_size, + d_allgathered_tuple_scalar_elements.data(), + rx_counts.data(), + displacements.data(), + stream); + std::vector h_allgathered_tuple_scalar_elements(comm.get_size() * tuple_size); + raft::update_host(h_allgathered_tuple_scalar_elements.data(), + d_allgathered_tuple_scalar_elements.data(), + comm.get_size() * tuple_size, + stream); + auto status = comm.sync_stream(stream); + CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); + + std::vector ret(comm.get_size()); + for (size_t i = 0; i < ret.size(); ++i) { + std::vector h_tuple_scalar_elements( + h_allgathered_tuple_scalar_elements.data() + i * tuple_size, + h_allgathered_tuple_scalar_elements.data() + (i + 1) * tuple_size); + detail::update_tuple_from_vector_of_tuple_scalar_elements_impl() + .update(ret[i], h_tuple_scalar_elements); + } + + return ret; +} + +// Return value is valid only in root (return value may better be std::optional in C++17 or later) +template +std::enable_if_t::value, std::vector> host_scalar_gather( + raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream) +{ + rmm::device_uvector d_outputs(comm.get_rank() == root ? comm.get_size() : int{1}, stream); + raft::update_device( + comm.get_rank() == root ? d_outputs.data() + comm.get_rank() : d_outputs.data(), + &input, + 1, + stream); + comm.gather(comm.get_rank() == root ? d_outputs.data() + comm.get_rank() : d_outputs.data(), + d_outputs.data(), + size_t{1}, + root, + stream); + std::vector h_outputs(comm.get_rank() == root ? comm.get_size() : 0); + if (comm.get_rank() == root) { + raft::update_host(h_outputs.data(), d_outputs.data(), comm.get_size(), stream); + } + auto status = comm.sync_stream(stream); + CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); + return h_outputs; +} + +// Return value is valid only in root (return value may better be std::optional in C++17 or later) +template +std::enable_if_t::value, std::vector> +host_scalar_gather(raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + std::vector h_tuple_scalar_elements(tuple_size); + rmm::device_uvector d_gathered_tuple_scalar_elements( + comm.get_rank() == root ? comm.get_size() * tuple_size : tuple_size, stream); + + detail::update_vector_of_tuple_scalar_elements_from_tuple_impl().update( + h_tuple_scalar_elements, input); + raft::update_device(comm.get_rank() == root + ? 
d_gathered_tuple_scalar_elements.data() + comm.get_rank() * tuple_size
+                        : d_gathered_tuple_scalar_elements.data(),
+                      h_tuple_scalar_elements.data(),
+                      tuple_size,
+                      stream);
+  comm.gather(comm.get_rank() == root
+                ? d_gathered_tuple_scalar_elements.data() + comm.get_rank() * tuple_size
+                : d_gathered_tuple_scalar_elements.data(),
+              d_gathered_tuple_scalar_elements.data(),
+              tuple_size,
+              root,
+              stream);
+  std::vector<int64_t> h_gathered_tuple_scalar_elements(
+    comm.get_rank() == root ? comm.get_size() * tuple_size : size_t{0});
+  if (comm.get_rank() == root) {
+    raft::update_host(h_gathered_tuple_scalar_elements.data(),
+                      d_gathered_tuple_scalar_elements.data(),
+                      comm.get_size() * tuple_size,
+                      stream);
+  }
+  auto status = comm.sync_stream(stream);
+  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
+
+  std::vector<T> ret(comm.get_size());
+  if (comm.get_rank() == root) {
+    for (size_t i = 0; i < ret.size(); ++i) {
+      std::vector<int64_t> h_tuple_scalar_elements(
+        h_gathered_tuple_scalar_elements.data() + i * tuple_size,
+        h_gathered_tuple_scalar_elements.data() + (i + 1) * tuple_size);
+      detail::update_tuple_from_vector_of_tuple_scalar_elements_impl<T, size_t{0}, tuple_size>()
+        .update(ret[i], h_tuple_scalar_elements);
+    }
+  }
+
+  return ret;
+}
+
+}  // namespace experimental
+}  // namespace cugraph
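For reference, a short usage sketch of the host scalar helpers above (illustrative only; the caller and its values are hypothetical, and note the reduction operator is hard-coded to SUM in this version):

// Sums one per-rank scalar across all ranks; the thrust::tuple overloads work the same way.
#include <cugraph/utilities/host_scalar_comm.cuh>
#include <raft/handle.hpp>

void example(raft::handle_t const& handle)  // hypothetical caller
{
  auto const& comm = handle.get_comms();
  size_t local_edge_count{12345};  // hypothetical per-rank value
  auto total = cugraph::experimental::host_scalar_allreduce(
    comm, local_edge_count, handle.get_stream());
  // `total` now holds the sum over all ranks, e.g. for sizing a global allocation.
}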
diff --git a/cpp/include/cugraph/utilities/path_retrieval.hpp b/cpp/include/cugraph/utilities/path_retrieval.hpp
new file mode 100644
index 00000000000..b4789c14c4b
--- /dev/null
+++ b/cpp/include/cugraph/utilities/path_retrieval.hpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/handle.hpp>
+
+namespace cugraph {
+
+/**
+ * @brief Takes the results of BFS or SSSP function call and sums the given
+ * weights along the path to the starting vertex.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weights. Needs to be a floating point type.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms. Must have at least one worker stream.
+ * @param vertices Pointer to vertex ids.
+ * @param preds Pointer to predecessors.
+ * @param info_weights Secondary weights along the edge from predecessor to vertex.
+ * @param out Contains for each index the sum of weights along the path unfolding.
+ * @param num_vertices Number of vertices.
+ **/
+template <typename vertex_t, typename weight_t>
+void get_traversed_cost(raft::handle_t const& handle,
+                        vertex_t const* vertices,
+                        vertex_t const* preds,
+                        weight_t const* info_weights,
+                        weight_t* out,
+                        vertex_t stop_vertex,
+                        vertex_t num_vertices);
+
+namespace experimental {
+/**
+ * @brief returns the COO format (src_vector, dst_vector) from the random walks (RW)
+ * paths.
+ *
+ * @tparam vertex_t Type of vertex indices.
+ * @tparam index_t Type used to store indexing and sizes.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param coalesced_sz_v coalesced vertex vector size.
+ * @param num_paths number of paths.
+ * @param d_coalesced_v coalesced vertex buffer.
+ * @param d_sizes paths size buffer.
+ * @return tuple of (src_vertex_vector, dst_vertex_vector, path_offsets), where
+ * path_offsets are the offsets where the COO set of each path starts.
+ */
+template <typename vertex_t, typename index_t>
+std::
+  tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>, rmm::device_uvector<index_t>>
+  convert_paths_to_coo(raft::handle_t const& handle,
+                       index_t coalesced_sz_v,
+                       index_t num_paths,
+                       rmm::device_buffer&& d_coalesced_v,
+                       rmm::device_buffer&& d_sizes);
+
+/**
+ * @brief returns additional RW information on vertex paths offsets and weight path sizes and
+ * offsets, for the coalesced case (the padded case does not need or provide this information)
+ *
+ * @tparam index_t Type used to store indexing and sizes.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param num_paths number of paths.
+ * @param ptr_d_sizes sizes of vertex paths.
+ * @return tuple of (vertex_path_offsets, weight_path_sizes, weight_path_offsets), where offsets are
+ * exclusive scan of corresponding sizes.
+ */
+template <typename index_t>
+std::tuple<rmm::device_uvector<index_t>, rmm::device_uvector<index_t>, rmm::device_uvector<index_t>>
+query_rw_sizes_offsets(raft::handle_t const& handle, index_t num_paths, index_t const* ptr_d_sizes);
+}  // namespace experimental
+
+namespace broadcast {
+/**
+ * @brief broadcasts graph_t object (only the single GPU version).
+ *
+ * @tparam graph_t Type of graph (view).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_ptr pointer to graph object: not `nullptr` on send, `nullptr` (ignored) on receive.
+ * @return graph_t object that was sent/received
+ */
+template <typename graph_t>
+graph_t graph_broadcast(raft::handle_t const& handle, graph_t* graph_ptr);
+};  // namespace broadcast
+
+}  // namespace cugraph
diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh
new file mode 100644
index 00000000000..18752897a58
--- /dev/null
+++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#pragma once + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +// inline to suppress a complaint about ODR violation +inline std::tuple, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector> +compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const& comm, + rmm::device_uvector const& d_tx_value_counts, + rmm::cuda_stream_view stream_view) +{ + auto const comm_size = comm.get_size(); + + rmm::device_uvector d_rx_value_counts(comm_size, stream_view); + + // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released. + std::vector tx_counts(comm_size, size_t{1}); + std::vector tx_offsets(comm_size); + std::iota(tx_offsets.begin(), tx_offsets.end(), size_t{0}); + std::vector tx_dst_ranks(comm_size); + std::iota(tx_dst_ranks.begin(), tx_dst_ranks.end(), int{0}); + std::vector rx_counts(comm_size, size_t{1}); + std::vector rx_offsets(comm_size); + std::iota(rx_offsets.begin(), rx_offsets.end(), size_t{0}); + std::vector rx_src_ranks(comm_size); + std::iota(rx_src_ranks.begin(), rx_src_ranks.end(), int{0}); + device_multicast_sendrecv(comm, + d_tx_value_counts.data(), + tx_counts, + tx_offsets, + tx_dst_ranks, + d_rx_value_counts.data(), + rx_counts, + rx_offsets, + rx_src_ranks, + stream_view); + + raft::update_host(tx_counts.data(), d_tx_value_counts.data(), comm_size, stream_view.value()); + raft::update_host(rx_counts.data(), d_rx_value_counts.data(), comm_size, stream_view.value()); + + stream_view.synchronize(); + + std::partial_sum(tx_counts.begin(), tx_counts.end() - 1, tx_offsets.begin() + 1); + std::partial_sum(rx_counts.begin(), rx_counts.end() - 1, rx_offsets.begin() + 1); + + int num_tx_dst_ranks{0}; + int num_rx_src_ranks{0}; + for (int i = 0; i < comm_size; ++i) { + if (tx_counts[i] != 0) { + tx_counts[num_tx_dst_ranks] = tx_counts[i]; + tx_offsets[num_tx_dst_ranks] = tx_offsets[i]; + tx_dst_ranks[num_tx_dst_ranks] = tx_dst_ranks[i]; + ++num_tx_dst_ranks; + } + if (rx_counts[i] != 0) { + rx_counts[num_rx_src_ranks] = rx_counts[i]; + rx_offsets[num_rx_src_ranks] = rx_offsets[i]; + rx_src_ranks[num_rx_src_ranks] = rx_src_ranks[i]; + ++num_rx_src_ranks; + } + } + tx_counts.resize(num_tx_dst_ranks); + tx_offsets.resize(num_tx_dst_ranks); + tx_dst_ranks.resize(num_tx_dst_ranks); + rx_counts.resize(num_rx_src_ranks); + rx_offsets.resize(num_rx_src_ranks); + rx_src_ranks.resize(num_rx_src_ranks); + + return std::make_tuple(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks); +} + +} // namespace detail + +template +rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [INOUT */, + ValueIterator tx_value_last /* [INOUT */, + ValueToGPUIdOp value_to_group_id_op, + int num_groups, + rmm::cuda_stream_view stream_view) +{ + thrust::sort(rmm::exec_policy(stream_view), + tx_value_first, + tx_value_last, + [value_to_group_id_op] __device__(auto lhs, auto rhs) { + return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); + }); + + auto group_id_first = thrust::make_transform_iterator( + tx_value_first, + [value_to_group_id_op] __device__(auto value) { return value_to_group_id_op(value); }); + rmm::device_uvector d_tx_dst_ranks(num_groups, stream_view); + rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream_view); + auto last = + thrust::reduce_by_key(rmm::exec_policy(stream_view), + group_id_first, + group_id_first + 
thrust::distance(tx_value_first, tx_value_last), + thrust::make_constant_iterator(size_t{1}), + d_tx_dst_ranks.begin(), + d_tx_value_counts.begin()); + if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < num_groups) { + rmm::device_uvector d_counts(num_groups, stream_view); + thrust::fill(rmm::exec_policy(stream_view), d_counts.begin(), d_counts.end(), size_t{0}); + thrust::scatter(rmm::exec_policy(stream_view), + d_tx_value_counts.begin(), + thrust::get<1>(last), + d_tx_dst_ranks.begin(), + d_counts.begin()); + d_tx_value_counts = std::move(d_counts); + } + + return d_tx_value_counts; +} + +template +rmm::device_uvector groupby_and_count(VertexIterator tx_key_first /* [INOUT */, + VertexIterator tx_key_last /* [INOUT */, + ValueIterator tx_value_first /* [INOUT */, + KeyToGPUIdOp key_to_group_id_op, + int num_groups, + rmm::cuda_stream_view stream_view) +{ + thrust::sort_by_key(rmm::exec_policy(stream_view), + tx_key_first, + tx_key_last, + tx_value_first, + [key_to_group_id_op] __device__(auto lhs, auto rhs) { + return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); + }); + + auto group_id_first = thrust::make_transform_iterator( + tx_key_first, [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); }); + rmm::device_uvector d_tx_dst_ranks(num_groups, stream_view); + rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream_view); + auto last = thrust::reduce_by_key(rmm::exec_policy(stream_view), + group_id_first, + group_id_first + thrust::distance(tx_key_first, tx_key_last), + thrust::make_constant_iterator(size_t{1}), + d_tx_dst_ranks.begin(), + d_tx_value_counts.begin()); + if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < num_groups) { + rmm::device_uvector d_counts(num_groups, stream_view); + thrust::fill(rmm::exec_policy(stream_view), d_counts.begin(), d_counts.end(), size_t{0}); + thrust::scatter(rmm::exec_policy(stream_view), + d_tx_value_counts.begin(), + thrust::get<1>(last), + d_tx_dst_ranks.begin(), + d_counts.begin()); + d_tx_value_counts = std::move(d_counts); + } + + return d_tx_value_counts; +} + +template +auto shuffle_values(raft::comms::comms_t const& comm, + TxValueIterator tx_value_first, + std::vector const& tx_value_counts, + rmm::cuda_stream_view stream_view) +{ + auto const comm_size = comm.get_size(); + + rmm::device_uvector d_tx_value_counts(comm_size, stream_view); + raft::update_device( + d_tx_value_counts.data(), tx_value_counts.data(), comm_size, stream_view.value()); + + std::vector tx_counts{}; + std::vector tx_offsets{}; + std::vector tx_dst_ranks{}; + std::vector rx_counts{}; + std::vector rx_offsets{}; + std::vector rx_src_ranks{}; + std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + + auto rx_value_buffer = + allocate_dataframe_buffer::value_type>( + rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); + + // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released + // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). 
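+  // Note: the shuffle is two-phase; compute_tx_rx_counts_offsets_ranks() above has
+  // already exchanged per-rank value counts and dropped peers with nothing to
+  // send/receive, so this single multicast sendrecv moves only the actual values.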
+ device_multicast_sendrecv( + comm, + tx_value_first, + tx_counts, + tx_offsets, + tx_dst_ranks, + get_dataframe_buffer_begin::value_type>( + rx_value_buffer), + rx_counts, + rx_offsets, + rx_src_ranks, + stream_view); + + if (rx_counts.size() < static_cast(comm_size)) { + std::vector tmp_rx_counts(comm_size, size_t{0}); + for (size_t i = 0; i < rx_src_ranks.size(); ++i) { + assert(rx_src_ranks[i] < comm_size); + tmp_rx_counts[rx_src_ranks[i]] = rx_counts[i]; + } + rx_counts = std::move(tmp_rx_counts); + } + + return std::make_tuple(std::move(rx_value_buffer), rx_counts); +} + +template +auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, + ValueIterator tx_value_first /* [INOUT */, + ValueIterator tx_value_last /* [INOUT */, + ValueToGPUIdOp value_to_gpu_id_op, + rmm::cuda_stream_view stream_view) +{ + auto const comm_size = comm.get_size(); + + auto d_tx_value_counts = groupby_and_count( + tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), stream_view); + + std::vector tx_counts{}; + std::vector tx_offsets{}; + std::vector tx_dst_ranks{}; + std::vector rx_counts{}; + std::vector rx_offsets{}; + std::vector rx_src_ranks{}; + std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + + auto rx_value_buffer = + allocate_dataframe_buffer::value_type>( + rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); + + // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released + // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). + device_multicast_sendrecv( + comm, + tx_value_first, + tx_counts, + tx_offsets, + tx_dst_ranks, + get_dataframe_buffer_begin::value_type>( + rx_value_buffer), + rx_counts, + rx_offsets, + rx_src_ranks, + stream_view); + + if (rx_counts.size() < static_cast(comm_size)) { + std::vector tmp_rx_counts(comm_size, size_t{0}); + for (size_t i = 0; i < rx_src_ranks.size(); ++i) { + tmp_rx_counts[rx_src_ranks[i]] = rx_counts[i]; + } + rx_counts = std::move(tmp_rx_counts); + } + + return std::make_tuple(std::move(rx_value_buffer), rx_counts); +} + +template +auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const& comm, + VertexIterator tx_key_first /* [INOUT */, + VertexIterator tx_key_last /* [INOUT */, + ValueIterator tx_value_first /* [INOUT */, + KeyToGPUIdOp key_to_gpu_id_op, + rmm::cuda_stream_view stream_view) +{ + auto const comm_size = comm.get_size(); + + auto d_tx_value_counts = groupby_and_count( + tx_key_first, tx_key_last, tx_value_first, key_to_gpu_id_op, comm.get_size(), stream_view); + + std::vector tx_counts{}; + std::vector tx_offsets{}; + std::vector tx_dst_ranks{}; + std::vector rx_counts{}; + std::vector rx_offsets{}; + std::vector rx_src_ranks{}; + std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + + rmm::device_uvector::value_type> rx_keys( + rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); + auto rx_value_buffer = + allocate_dataframe_buffer::value_type>( + rx_keys.size(), stream_view); + + // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released + // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). 
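+  // Keys and values are shuffled in two passes over the same compacted
+  // counts/offsets/ranks: the keys below, then the corresponding values.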
+ device_multicast_sendrecv(comm, + tx_key_first, + tx_counts, + tx_offsets, + tx_dst_ranks, + rx_keys.begin(), + rx_counts, + rx_offsets, + rx_src_ranks, + stream_view); + + // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released + // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). + device_multicast_sendrecv( + comm, + tx_value_first, + tx_counts, + tx_offsets, + tx_dst_ranks, + get_dataframe_buffer_begin::value_type>( + rx_value_buffer), + rx_counts, + rx_offsets, + rx_src_ranks, + stream_view); + + if (rx_counts.size() < static_cast(comm_size)) { + std::vector tmp_rx_counts(comm_size, size_t{0}); + for (size_t i = 0; i < rx_src_ranks.size(); ++i) { + assert(rx_src_ranks[i] < comm_size); + tmp_rx_counts[rx_src_ranks[i]] = rx_counts[i]; + } + rx_counts = std::move(tmp_rx_counts); + } + + return std::make_tuple(std::move(rx_keys), std::move(rx_value_buffer), rx_counts); +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/utilities/thrust_tuple_utils.cuh b/cpp/include/cugraph/utilities/thrust_tuple_utils.cuh similarity index 81% rename from cpp/include/utilities/thrust_tuple_utils.cuh rename to cpp/include/cugraph/utilities/thrust_tuple_utils.cuh index 0ad71ba5e05..ddc325b6bbb 100644 --- a/cpp/include/utilities/thrust_tuple_utils.cuh +++ b/cpp/include/cugraph/utilities/thrust_tuple_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include @@ -60,27 +61,6 @@ struct compute_thrust_tuple_element_sizes_impl { void compute(std::array::value>& arr) const {} }; -template -__device__ constexpr auto remove_first_thrust_tuple_element_impl(TupleType const& tuple, - std::index_sequence) -{ - return thrust::make_tuple(thrust::get<1 + Is>(tuple)...); -} - -template -struct plus_thrust_tuple_impl { - __host__ __device__ constexpr void compute(TupleType& lhs, TupleType const& rhs) const - { - thrust::get(lhs) += thrust::get(rhs); - plus_thrust_tuple_impl().compute(lhs, rhs); - } -}; - -template -struct plus_thrust_tuple_impl { - __host__ __device__ constexpr void compute(TupleType& lhs, TupleType const& rhs) const {} -}; - template __device__ std::enable_if_t::value, void> atomic_accumulate_impl( thrust::detail::any_assign& /* dereferencing thrust::discard_iterator results in this type */ lhs, @@ -199,28 +179,6 @@ struct compute_thrust_tuple_element_sizes { } }; -template -struct remove_first_thrust_tuple_element { - __device__ constexpr auto operator()(TupleType const& tuple) const - { - size_t constexpr tuple_size = thrust::tuple_size::value; - return detail::remove_first_thrust_tuple_element_impl( - tuple, std::make_index_sequence()); - } -}; - -template -struct plus_thrust_tuple { - __host__ __device__ constexpr TupleType operator()(TupleType const& lhs, - TupleType const& rhs) const - { - size_t constexpr tuple_size = thrust::tuple_size::value; - auto ret = lhs; - detail::plus_thrust_tuple_impl().compute(ret, rhs); - return ret; - } -}; - template struct atomic_accumulate_thrust_tuple { __device__ constexpr void operator()(Iterator iter, TupleType const& value) const diff --git a/cpp/include/vertex_partition_device.cuh b/cpp/include/cugraph/vertex_partition_device.cuh similarity index 92% rename from cpp/include/vertex_partition_device.cuh rename to 
cpp/include/cugraph/vertex_partition_device.cuh index a6a78ad3878..f598c7d89d8 100644 --- a/cpp/include/vertex_partition_device.cuh +++ b/cpp/include/cugraph/vertex_partition_device.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,8 +15,8 @@ */ #pragma once -#include -#include +#include +#include #include @@ -66,8 +66,8 @@ class vertex_partition_device_t= first_) && (v < last_); } diff --git a/cpp/include/cugraph/vertex_partition_device_view.cuh b/cpp/include/cugraph/vertex_partition_device_view.cuh new file mode 100644 index 00000000000..39eb3e7238a --- /dev/null +++ b/cpp/include/cugraph/vertex_partition_device_view.cuh @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +template +class vertex_partition_device_view_base_t { + public: + vertex_partition_device_view_base_t(vertex_t number_of_vertices) + : number_of_vertices_(number_of_vertices) + { + } + + template + __host__ __device__ std::enable_if_t::value, bool> is_valid_vertex( + vertex_type v) const noexcept + { + return ((v >= 0) && (v < number_of_vertices_)); + } + + template + __host__ __device__ std::enable_if_t::value, bool> is_valid_vertex( + vertex_type v) const noexcept + { + return (v < number_of_vertices_); + } + + private: + // should be trivially copyable to device + vertex_t number_of_vertices_{0}; +}; + +} // namespace detail + +template +class vertex_partition_device_view_t; + +// multi-GPU version +template +class vertex_partition_device_view_t> + : public detail::vertex_partition_device_view_base_t { + public: + vertex_partition_device_view_t(vertex_partition_view_t view) + : detail::vertex_partition_device_view_base_t(view.get_number_of_vertices()), + local_vertex_first_(view.get_local_vertex_first()), + local_vertex_last_(view.get_local_vertex_last()) + { + } + + __host__ __device__ bool is_local_vertex_nocheck(vertex_t v) const noexcept + { + return (v >= local_vertex_first_) && (v < local_vertex_last_); + } + + __host__ __device__ vertex_t + get_local_vertex_offset_from_vertex_nocheck(vertex_t v) const noexcept + { + return v - local_vertex_first_; + } + + private: + // should be trivially copyable to device + vertex_t local_vertex_first_{0}; + vertex_t local_vertex_last_{0}; +}; + +// single-GPU version +template +class vertex_partition_device_view_t> + : public detail::vertex_partition_device_view_base_t { + public: + vertex_partition_device_view_t(vertex_partition_view_t view) + : detail::vertex_partition_device_view_base_t(view.get_number_of_vertices()) + { + } + + __host__ __device__ constexpr bool is_local_vertex_nocheck(vertex_t v) const noexcept + { + return true; + } + + __host__ __device__ constexpr vertex_t 
get_local_vertex_offset_from_vertex_nocheck( + vertex_t v) const noexcept + { + return v; + } +}; + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/cugraph/vertex_partition_view.hpp b/cpp/include/cugraph/vertex_partition_view.hpp new file mode 100644 index 00000000000..51badf162eb --- /dev/null +++ b/cpp/include/cugraph/vertex_partition_view.hpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +template +class vertex_partition_view_base_t { + public: + vertex_partition_view_base_t(vertex_t number_of_vertices) + : number_of_vertices_(number_of_vertices) + { + } + + vertex_t get_number_of_vertices() const { return number_of_vertices_; } + + private: + vertex_t number_of_vertices_{0}; +}; + +} // namespace detail + +template +class vertex_partition_view_t; + +// multi-GPU version +template +class vertex_partition_view_t> + : public detail::vertex_partition_view_base_t { + public: + vertex_partition_view_t(vertex_t number_of_vertices, + vertex_t local_vertex_first, + vertex_t local_vertex_last) + : detail::vertex_partition_view_base_t(number_of_vertices), + local_vertex_first_(local_vertex_first), + local_vertex_last_(local_vertex_last) + { + } + + vertex_t get_local_vertex_first() const { return local_vertex_first_; } + vertex_t get_local_vertex_last() const { return local_vertex_last_; } + + private: + vertex_t local_vertex_first_{0}; + vertex_t local_vertex_last_{0}; +}; + +// single-GPU version +template +class vertex_partition_view_t> + : public detail::vertex_partition_view_base_t { + public: + vertex_partition_view_t(vertex_t number_of_vertices) + : detail::vertex_partition_view_base_t(number_of_vertices) + { + } + + vertex_t get_local_vertex_first() const { return vertex_t{0}; } + vertex_t get_local_vertex_last() const { return this->get_number_of_vertices(); } +}; + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/cugraph/visitors/bfs_visitor.hpp b/cpp/include/cugraph/visitors/bfs_visitor.hpp new file mode 100644 index 00000000000..75b6d9169f0 --- /dev/null +++ b/cpp/include/cugraph/visitors/bfs_visitor.hpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Andrei Schaffer, aschaffer@nvidia.com +// + +#pragma once +#include "erased_pack.hpp" +#include "graph_envelope.hpp" +#include "ret_terased.hpp" + +namespace cugraph { +namespace visitors { + +using namespace cugraph::experimental; + +// macro option: MAKE_VISITOR(bfs) + +// primary empty template: +// +template +struct bfs_visitor; + +// dummy out non-candidate instantiation paths: +// +template +struct bfs_visitor::value)>> + : visitor_t { + void visit_graph(graph_envelope_t::base_graph_t const&) override + { + // purposely empty + } + return_t const& get_result(void) const override + { + static return_t r{}; + return r; + } +}; + +template +struct bfs_visitor::value>> : visitor_t { + bfs_visitor(erased_pack_t& ep) : ep_(ep) {} + + void visit_graph(graph_envelope_t::base_graph_t const&) override; + + return_t const& get_result(void) const override { return result_; } + + private: + erased_pack_t& ep_; + return_t result_; +}; + +} // namespace visitors +} // namespace cugraph diff --git a/cpp/include/cugraph/visitors/cascaded_dispatch.hpp b/cpp/include/cugraph/visitors/cascaded_dispatch.hpp new file mode 100755 index 00000000000..b513c364ce7 --- /dev/null +++ b/cpp/include/cugraph/visitors/cascaded_dispatch.hpp @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Andrei Schaffer, aschaffer@nvidia.com +// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "enum_mapping.hpp" +#include "graph_enum_mapping.hpp" + +#include +#include "graph_factory.hpp" + +namespace cugraph { +namespace visitors { + +using namespace cugraph::experimental; +using pair_uniques_t = graph_envelope_t::pair_uniques_t; + +// dummy-out non-candidate paths: +// +template ::value, void*> = nullptr> +constexpr pair_uniques_t graph_dispatcher(GTypes graph_type, erased_pack_t& ep) +{ + /// return nullptr; + return pair_uniques_t{nullptr, nullptr}; +} + +// final step of cascading: calls factory on erased pack: +// +template ::value, void*> = nullptr> +constexpr pair_uniques_t graph_dispatcher(GTypes graph_type, erased_pack_t& ep) +{ + switch (graph_type) { + case GTypes::GRAPH_T: { + using graph_t = typename GMapType::type; + graph_factory_t factory; + + pair_uniques_t p_uniques = + std::make_pair(factory.make_graph(ep), + std::make_unique>()); + + return p_uniques; + } break; + + default: { + std::stringstream ss; + ss << "ERROR: Unknown type enum:" << static_cast(graph_type); + throw std::runtime_error(ss.str()); + } + } +} + +// multi_gpu bool dispatcher: +// resolves bool `multi_gpu` +// and using template arguments vertex_t, edge_t, weight_t, store_transpose +// cascades into next level +// graph_dispatcher() +// +template +constexpr decltype(auto) multi_gpu_dispatcher(bool multi_gpu, GTypes graph_type, erased_pack_t& ep) +{ + switch (multi_gpu) { + case true: { + return graph_dispatcher(graph_type, ep); + } break; + case false: { + return graph_dispatcher(graph_type, ep); + } + } +} + +// transpose bool dispatcher: +// resolves bool `store_transpose` +// and using template arguments vertex_t, edge_t, weight_t +// cascades into next level +// multi_gpu_dispatcher() +// +template +constexpr decltype(auto) transp_dispatcher(bool store_transposed, + bool multi_gpu, + GTypes graph_type, + erased_pack_t& ep) +{ + switch (store_transposed) { + case true: { + return multi_gpu_dispatcher(multi_gpu, graph_type, ep); + } break; + case false: { + return multi_gpu_dispatcher(multi_gpu, graph_type, ep); + } + } +} + +// weight type dispatcher: +// resolves weigth_t from weight_type enum +// and using template arguments vertex_t, edge_t +// cascades into next level +// transp_dispatcher() +// +template +constexpr decltype(auto) weight_dispatcher( + DTypes weight_type, bool store_transposed, bool multi_gpu, GTypes graph_type, erased_pack_t& ep) +{ + switch (weight_type) { + case DTypes::INT32: { + using weight_t = typename DMapType::type; + return transp_dispatcher( + store_transposed, multi_gpu, graph_type, ep); + } break; + case DTypes::INT64: { + using weight_t = typename DMapType::type; + return transp_dispatcher( + store_transposed, multi_gpu, graph_type, ep); + } break; + case DTypes::FLOAT32: { + using weight_t = typename DMapType::type; + return transp_dispatcher( + store_transposed, multi_gpu, graph_type, ep); + } break; + case DTypes::FLOAT64: { + using weight_t = typename DMapType::type; + return transp_dispatcher( + store_transposed, multi_gpu, graph_type, ep); + } break; + default: { + std::stringstream ss; + ss << "ERROR: Unknown type enum:" << static_cast(weight_type); + throw std::runtime_error(ss.str()); + } + } +} + +// edge type dispatcher: +// resolves edge_t from edge_type enum +// and using template argument vertex_t +// cascades into the next level +// weight_dispatcher(); +// +template +constexpr 
+// edge type dispatcher:
+// resolves edge_t from edge_type enum
+// and using template argument vertex_t
+// cascades into the next level
+// weight_dispatcher();
+//
+template <typename vertex_t>
+constexpr decltype(auto) edge_dispatcher(DTypes edge_type,
+                                         DTypes weight_type,
+                                         bool store_transposed,
+                                         bool multi_gpu,
+                                         GTypes graph_type,
+                                         erased_pack_t& ep)
+{
+  switch (edge_type) {
+    case DTypes::INT32: {
+      using edge_t = typename DMapType<DTypes::INT32>::type;
+      return weight_dispatcher<vertex_t, edge_t>(
+        weight_type, store_transposed, multi_gpu, graph_type, ep);
+    } break;
+    case DTypes::INT64: {
+      using edge_t = typename DMapType<DTypes::INT64>::type;
+      return weight_dispatcher<vertex_t, edge_t>(
+        weight_type, store_transposed, multi_gpu, graph_type, ep);
+    } break;
+    case DTypes::FLOAT32: {
+      using edge_t = typename DMapType<DTypes::FLOAT32>::type;
+      return weight_dispatcher<vertex_t, edge_t>(
+        weight_type, store_transposed, multi_gpu, graph_type, ep);
+    } break;
+    case DTypes::FLOAT64: {
+      using edge_t = typename DMapType<DTypes::FLOAT64>::type;
+      return weight_dispatcher<vertex_t, edge_t>(
+        weight_type, store_transposed, multi_gpu, graph_type, ep);
+    } break;
+    default: {
+      std::stringstream ss;
+      ss << "ERROR: Unknown type enum:" << static_cast<int>(edge_type);
+      throw std::runtime_error(ss.str());
+    }
+  }
+}
+
+// vertex type dispatcher:
+// entry point,
+// resolves vertex_t from vertex_type enum
+// and cascades into the next level
+// edge_dispatcher();
+//
+inline decltype(auto) vertex_dispatcher(DTypes vertex_type,
+                                        DTypes edge_type,
+                                        DTypes weight_type,
+                                        bool store_transposed,
+                                        bool multi_gpu,
+                                        GTypes graph_type,
+                                        erased_pack_t& ep)
+{
+  switch (vertex_type) {
+    case DTypes::INT32: {
+      using vertex_t = typename DMapType<DTypes::INT32>::type;
+      return edge_dispatcher<vertex_t>(
+        edge_type, weight_type, store_transposed, multi_gpu, graph_type, ep);
+    } break;
+    case DTypes::INT64: {
+      using vertex_t = typename DMapType<DTypes::INT64>::type;
+      return edge_dispatcher<vertex_t>(
+        edge_type, weight_type, store_transposed, multi_gpu, graph_type, ep);
+    } break;
+    case DTypes::FLOAT32: {
+      using vertex_t = typename DMapType<DTypes::FLOAT32>::type;
+      return edge_dispatcher<vertex_t>(
+        edge_type, weight_type, store_transposed, multi_gpu, graph_type, ep);
+    } break;
+    case DTypes::FLOAT64: {
+      using vertex_t = typename DMapType<DTypes::FLOAT64>::type;
+      return edge_dispatcher<vertex_t>(
+        edge_type, weight_type, store_transposed, multi_gpu, graph_type, ep);
+    } break;
+    default: {
+      std::stringstream ss;
+      ss << "ERROR: Unknown type enum:" << static_cast<int>(vertex_type);
+      throw std::runtime_error(ss.str());
+    }
+  }
+}
+
+} // namespace visitors
+} // namespace cugraph
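Client code that holds only run-time type information enters the cascade at vertex_dispatcher(); each level peels off one run-time flag until graph_dispatcher() can instantiate the right factory. A hedged sketch of an invocation (the pack contents are whatever the selected graph factory expects; see graph_factory.hpp further below):

  using namespace cugraph::visitors;
  erased_pack_t ep{/* factory arguments for the selected graph type, elided */};
  auto p_graph_and_factory = vertex_dispatcher(DTypes::INT32,   // vertex_t
                                               DTypes::INT32,   // edge_t
                                               DTypes::FLOAT32, // weight_t
                                               false,           // store_transposed
                                               false,           // multi_gpu
                                               GTypes::GRAPH_T,
                                               ep);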
diff --git a/cpp/include/cugraph/visitors/enum_mapping.hpp b/cpp/include/cugraph/visitors/enum_mapping.hpp
new file mode 100755
index 00000000000..ab72f87bcab
--- /dev/null
+++ b/cpp/include/cugraph/visitors/enum_mapping.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Andrei Schaffer, aschaffer@nvidia.com
+//
+
+#pragma once
+
+#include <cstdint>
+
+namespace cugraph {
+namespace visitors {
+
+enum class DTypes { INT32 = 0, INT64, FLOAT32, FLOAT64, NTYPES };
+
+template <DTypes t>
+struct DMapType;
+
+template <>
+struct DMapType<DTypes::INT32> {
+  using type = int32_t;
+};
+
+template <>
+struct DMapType<DTypes::INT64> {
+  using type = int64_t;
+};
+
+template <>
+struct DMapType<DTypes::FLOAT32> {
+  using type = float;
+};
+
+template <>
+struct DMapType<DTypes::FLOAT64> {
+  using type = double;
+};
+
+template <typename T>
+struct reverse_dmap_t;
+
+template <>
+struct reverse_dmap_t<int32_t> {
+  static constexpr DTypes type_id = DTypes::INT32;
+};
+
+template <>
+struct reverse_dmap_t<int64_t> {
+  static constexpr DTypes type_id = DTypes::INT64;
+};
+
+template <>
+struct reverse_dmap_t<float> {
+  static constexpr DTypes type_id = DTypes::FLOAT32;
+};
+
+template <>
+struct reverse_dmap_t<double> {
+  static constexpr DTypes type_id = DTypes::FLOAT64;
+};
+
+} // namespace visitors
+} // namespace cugraph
diff --git a/cpp/include/cugraph/visitors/erased_api.hpp b/cpp/include/cugraph/visitors/erased_api.hpp
new file mode 100644
index 00000000000..b85ee84bff6
--- /dev/null
+++ b/cpp/include/cugraph/visitors/erased_api.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Andrei Schaffer, aschaffer@nvidia.com
+//
+
+/**
+ * @brief Set of type-erased wrappers, following an (almost) universal signature:
+ * a graph_envelope reference plus an erased_pack_t of erased arguments that the caller is
+ * responsible for setting correctly (FIXME: a handshake protocol must be put in place);
+ * each returns a type-erased result set.
+ */
+
+#pragma once
+
+#include "erased_pack.hpp"
+#include "graph_envelope.hpp"
+#include "ret_terased.hpp"
+
+namespace cugraph {
+namespace api {
+
+using namespace cugraph::visitors;
+
+/**
+ * @brief Type-erased BFS wrapper.
+ *
+ * @param[in] g graph_envelope reference;
+ * @param[in] ep erased_pack_t pack of erased arguments that the caller is responsible for
+ * setting correctly (FIXME: a handshake protocol must be put in place);
+ * @return type-erased result set;
+ */
+return_t bfs(graph_envelope_t const& g, erased_pack_t& ep);
+
+// TODO: more to follow...
+
+} // namespace api
+} // namespace cugraph
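A hedged sketch of the intended call path through this wrapper, assuming an already-built `envelope` and a BFS argument pack `ep` (hypothetical names); the concrete type passed to get<...>() is purely illustrative and must match whatever the visitor actually stored:

  cugraph::visitors::return_t r = cugraph::api::bfs(envelope, ep);
  auto distances = r.get<int32_t*>();  // unpack the type-erased result (illustrative type)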
diff --git a/cpp/include/cugraph/visitors/erased_pack.hpp b/cpp/include/cugraph/visitors/erased_pack.hpp
new file mode 100644
index 00000000000..eab1310eb02
--- /dev/null
+++ b/cpp/include/cugraph/visitors/erased_pack.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Andrei Schaffer, aschaffer@nvidia.com
+//
+
+#pragma once
+
+#ifdef _DEBUG_
+#include <iostream>
+#endif
+
+#include <cstddef>
+#include <initializer_list>
+#include <vector>
+
+namespace cugraph {
+namespace visitors {
+
+struct erased_pack_t {
+  erased_pack_t(void** p_args, size_t n)
+    : args_{[](void** p, size_t n) {
+        std::vector<void*> v;
+        v.insert(v.begin(), p, p + n);
+        return v;
+      }(p_args, n)}
+  {
+    // args_.insert(args_.begin(), p_args, p_args + n);
+  }
+
+  erased_pack_t(std::initializer_list<void*> args) : args_{args} {}
+
+  std::vector<void*> const& get_args(void) const { return args_; }
+
+  erased_pack_t(erased_pack_t const&) = delete;
+  erased_pack_t& operator=(erased_pack_t const&) = delete;
+
+  erased_pack_t(erased_pack_t&& other) : args_(std::move(other.args_)) {}
+
+  erased_pack_t& operator=(erased_pack_t&& other)
+  {
+    args_ = std::move(other.args_);
+    return *this;
+  }
+
+#ifdef _DEBUG_
+  void print(void) const
+  {
+    std::cout << "list args addresses:\n";
+    for (auto&& elem : args_)
+      std::cout << elem << ", ";
+    std::cout << '\n';
+  }
+#endif
+
+ private:
+  std::vector<void*> args_;
+};
+
+} // namespace visitors
+} // namespace cugraph
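Because the pack stores raw void* addresses, every argument must outlive the pack and be cast back to exactly the type the consumer expects. A minimal hedged example (`h` and `flag` are hypothetical locals):

  raft::handle_t h;
  bool flag{false};
  cugraph::visitors::erased_pack_t ep{&h, &flag};  // initializer_list<void*> overload
  auto const& args = ep.get_args();                // two type-erased addresses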
diff --git a/cpp/include/cugraph/visitors/graph_enum.hpp b/cpp/include/cugraph/visitors/graph_enum.hpp
new file mode 100755
index 00000000000..eca53035313
--- /dev/null
+++ b/cpp/include/cugraph/visitors/graph_enum.hpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Andrei Schaffer, aschaffer@nvidia.com
+//
+
+#pragma once
+
+namespace cugraph {
+namespace visitors {
+enum class GTypes { GRAPH_T = 0, GRAPH_VIEW_T, NTYPES };
+} // namespace visitors
+} // namespace cugraph
diff --git a/cpp/include/cugraph/visitors/graph_enum_mapping.hpp b/cpp/include/cugraph/visitors/graph_enum_mapping.hpp
new file mode 100755
index 00000000000..dfde78b6b4b
--- /dev/null
+++ b/cpp/include/cugraph/visitors/graph_enum_mapping.hpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Andrei Schaffer, aschaffer@nvidia.com
+//
+
+#pragma once
+
+#include <cugraph/experimental/graph.hpp>
+#include "graph_enum.hpp"
+
+namespace cugraph {
+namespace visitors {
+
+template <GTypes graph_tid,
+          typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          bool st,
+          bool mg>
+struct GMapType;  // primary template, purposely empty
+
+// partial specializations:
+//
+template <typename vertex_t, typename edge_t, typename weight_t, bool st, bool mg>
+struct GMapType<GTypes::GRAPH_T, vertex_t, edge_t, weight_t, st, mg> {
+  using type = graph_t<vertex_t, edge_t, weight_t, st, mg>;
+};
+
+} // namespace visitors
+} // namespace cugraph
diff --git a/cpp/include/cugraph/visitors/graph_envelope.hpp b/cpp/include/cugraph/visitors/graph_envelope.hpp
new file mode 100755
index 00000000000..d5701088783
--- /dev/null
+++ b/cpp/include/cugraph/visitors/graph_envelope.hpp
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Andrei Schaffer, aschaffer@nvidia.com
+//
+
+/**
+ * @brief Set of classes abstracting away type erasure, templates, and template constraints
+ * from client code that must supply run-time type information (RTTI) and has no template
+ * constructs.
+ *
+ * Goal: be able to call an algorithm (say, louvain()) on a type-erased graph created from RTTI:
+ * {
+ *   auto graph = make_graph(flags...);
+ *   auto res   = louvain(graph, params...);
+ * }
+ * params will also be type-erased (or of the same type regardless of graph type); they will
+ * be appropriately passed to the Factory and then converted and passed to the Visitor
+ * constructor.
+ */
+
+#pragma once
+
+#include <functional>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+
+#include "enum_mapping.hpp"
+#include "graph_enum.hpp"
+
+#include <cugraph/utilities/graph_traits.hpp>
+
+namespace cugraph {
+namespace visitors {
+
+using namespace cugraph::experimental;
+
+class erased_pack_t;  // forward...
+class return_t;       // forward...
+
+// visitor base, incomplete:
+//
+class visitor_t;  // forward...
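As a sanity check on the enum-to-graph mapping defined in graph_enum_mapping.hpp above, a hedged compile-time illustration (graph_t's trailing Enable parameter is left defaulted):

  using g_t = typename cugraph::visitors::GMapType<cugraph::visitors::GTypes::GRAPH_T,
                                                   int32_t, int32_t, float,
                                                   false, false>::type;
  static_assert(std::is_same<g_t,
                             cugraph::experimental::graph_t<int32_t, int32_t, float,
                                                            false, false>>::value,
                "GMapType resolves GRAPH_T to the concrete graph_t");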
+
+// envelope class around all
+// graph classes:
+//
+struct graph_envelope_t {
+  struct base_graph_t {  // necessary to avoid circular dependency
+                         // between graph_base_t and graph_envelope_t
+    virtual ~base_graph_t() {}
+
+    /// virtual void print(void) const = 0;
+
+    virtual void apply(visitor_t& v) const = 0;
+  };
+
+  // abstract factory:
+  //
+  struct visitor_factory_t {
+    virtual ~visitor_factory_t(void) {}  // factories are owned and deleted via base pointers
+
+    virtual std::unique_ptr<visitor_t> make_louvain_visitor(erased_pack_t&) const = 0;
+
+    virtual std::unique_ptr<visitor_t> make_bfs_visitor(erased_pack_t&) const = 0;
+  };
+
+  using pair_uniques_t =
+    std::pair<std::unique_ptr<base_graph_t>, std::unique_ptr<visitor_factory_t>>;
+
+  void apply(visitor_t& v) const
+  {
+    if (p_impl_fact_.first)
+      p_impl_fact_.first->apply(v);
+    else
+      throw std::runtime_error("ERROR: Implementation not allocated.");
+  }
+
+  // void print(void) const
+  // {
+  //   if (p_impl_fact_.first)
+  //     p_impl_fact_.first->print();
+  //   else
+  //     throw std::runtime_error("ERROR: Implementation not allocated.");
+  // }
+
+  std::unique_ptr<base_graph_t> const& graph(void) const { return p_impl_fact_.first; }
+
+  std::unique_ptr<visitor_factory_t> const& factory(void) const { return p_impl_fact_.second; }
+
+  graph_envelope_t(DTypes vertex_tid,
+                   DTypes edge_tid,
+                   DTypes weight_tid,
+                   bool,
+                   bool,
+                   GTypes graph_tid,
+                   erased_pack_t&);
+
+ private:
+  // need it to hide the parameterization of
+  // (graph implementation, factory implementation)
+  // by dependent types: vertex_t, edge_t, weight_t
+  //
+  pair_uniques_t p_impl_fact_;
+};
+
+// visitor base:
+//
+class visitor_t {
+ public:
+  virtual ~visitor_t(void) {}
+
+  virtual void visit_graph(graph_envelope_t::base_graph_t const&) = 0;
+
+  virtual return_t const& get_result(void) const = 0;
+};
+
+// convenience templatized base:
+//
+template <typename vertex_t, typename edge_t, typename weight_t>
+struct dependent_graph_t : graph_envelope_t::base_graph_t {
+  using vertex_type = vertex_t;
+  using edge_type   = edge_t;
+  using weight_type = weight_t;
+};
+
+// primary empty template:
+//
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          bool st,
+          bool mg,
+          typename Enable = void>
+struct dependent_factory_t;
+
+// dummy out non-candidate instantiation paths:
+//
+template <typename vertex_t, typename edge_t, typename weight_t, bool st, bool mg>
+struct dependent_factory_t<vertex_t,
+                           edge_t,
+                           weight_t,
+                           st,
+                           mg,
+                           std::enable_if_t<!is_candidate<vertex_t, edge_t, weight_t>::value>>
+  : graph_envelope_t::visitor_factory_t {
+  using vertex_type = vertex_t;
+  using edge_type   = edge_t;
+  using weight_type = weight_t;
+
+  std::unique_ptr<visitor_t> make_louvain_visitor(erased_pack_t&) const override { return nullptr; }
+
+  std::unique_ptr<visitor_t> make_bfs_visitor(erased_pack_t&) const override { return nullptr; }
+};
+
+template <typename vertex_t, typename edge_t, typename weight_t, bool st, bool mg>
+struct dependent_factory_t<vertex_t,
+                           edge_t,
+                           weight_t,
+                           st,
+                           mg,
+                           std::enable_if_t<is_candidate<vertex_t, edge_t, weight_t>::value>>
+  : graph_envelope_t::visitor_factory_t {
+  using vertex_type = vertex_t;
+  using edge_type   = edge_t;
+  using weight_type = weight_t;
+
+  std::unique_ptr<visitor_t> make_louvain_visitor(erased_pack_t&) const override;
+
+  std::unique_ptr<visitor_t> make_bfs_visitor(erased_pack_t&) const override;
+};
+
+// utility factory selector:
+//
+template <typename graph_type>
+std::unique_ptr<visitor_t> make_visitor(
+  graph_type const& tag,  // necessary to extract dependent types
+  std::function<std::unique_ptr<visitor_t>(graph_envelope_t::visitor_factory_t const&,
+                                           erased_pack_t&)>
+    f,  // selector functor that picks the appropriate make_* member function of the
+        // visitor factory and passes `ep` to it
+  erased_pack_t& ep)  // erased pack of args to be passed to the factory
+{
+  using vertex_t = typename graph_type::vertex_type;
+  using edge_t   = typename graph_type::edge_type;
+  using weight_t = typename graph_type::weight_type;
+  constexpr bool st = graph_type::is_adj_matrix_transposed;
+  constexpr bool mg = graph_type::is_multi_gpu;
+
+  dependent_factory_t<vertex_t, edge_t, weight_t, st, mg> factory;
+
+  return f(factory, ep);
+}
+
+} // namespace visitors
+} // namespace cugraph
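make_visitor() exists so callers can choose the algorithm at the call site while the factory supplies the dependent types. A hedged sketch with a lambda selector, where `g` is a hypothetical concrete graph used purely as a type tag and `ep` a pre-built argument pack:

  using namespace cugraph::visitors;
  auto v = make_visitor(
    g,  // tag: dependent types are extracted from decltype(g)
    [](graph_envelope_t::visitor_factory_t const& fact, erased_pack_t& ep) {
      return fact.make_bfs_visitor(ep);  // pick the BFS maker
    },
    ep);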
diff --git a/cpp/include/cugraph/visitors/graph_factory.hpp b/cpp/include/cugraph/visitors/graph_factory.hpp
new file mode 100644
index 00000000000..22f53b72b01
--- /dev/null
+++ b/cpp/include/cugraph/visitors/graph_factory.hpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Andrei Schaffer, aschaffer@nvidia.com
+//
+
+#pragma once
+
+#include <cassert>
+#include <memory>
+#include <optional>
+#include <stdexcept>
+#include <tuple>
+#include <vector>
+
+#include "graph_envelope.hpp"
+// prevent clang-format from rearranging the order of headers
+#include "erased_pack.hpp"
+//
+// not really needed here;
+// included just to keep the clang-format policy
+// of order-independent header inclusion happy...
+//
+#include <cugraph/experimental/graph.hpp>
+#include <raft/handle.hpp>
+
+#define _DEBUG_
+
+#ifdef _DEBUG_
+#include <iostream>
+#endif
+
+namespace cugraph {
+namespace visitors {
+
+using namespace cugraph::experimental;
+
+struct graph_factory_base_t {
+  virtual ~graph_factory_base_t(void) {}
+
+  virtual std::unique_ptr<graph_envelope_t::base_graph_t> make_graph(erased_pack_t&) const = 0;
+};
+
+// argument unpacker (from `erased_pack_t`)
+// for graph construction
+//
+template <typename graph_type>
+struct graph_arg_unpacker_t {
+  using vertex_t = typename graph_type::vertex_type;
+  using edge_t   = typename graph_type::edge_type;
+  using weight_t = typename graph_type::weight_type;
+  static constexpr bool st = graph_type::is_adj_matrix_transposed;
+  static constexpr bool mg = graph_type::is_multi_gpu;
+
+  void operator()(erased_pack_t& ep,
+                  std::tuple<vertex_t, edge_t, weight_t>& t_args) const
+  {
+  }
+};
+
+// primary template factory; to be (partially) specialized;
+// and explicitly instantiated for concrete graphs
+//
+template <typename graph_type>
+struct graph_factory_t : graph_factory_base_t {
+  std::unique_ptr<graph_envelope_t::base_graph_t> make_graph(erased_pack_t&) const override
+  {
+    throw std::runtime_error("Empty factory, not to be called...");
+  }
+};
+
+// Linker PROBLEM (FIXED):
+// dispatcher needs _ALL_ paths instantiated,
+// not just the ones explicitly instantiated
+// (EIDir) in `graph.cpp`
+//
+// Possible SOLUTIONS:
+//
+// (1.) the _factory_ must provide "dummy"
+//      instantiations for paths not needed;
+//
+// or:
+//
+// (2.) (Adopted solution)
+//      the _dispatcher_ (graph_dispatcher())
+//      must provide an empty implementation
+//      for the instantiations that are not needed; (Done!)
+//
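The multi-GPU specialization below consumes a nine-entry pack in a fixed order. A hedged sketch of assembling it on the caller side (all locals are hypothetical, and every address must stay valid until make_graph() returns):

  // hypothetical, already-populated device buffers and scalars:
  void* p_args[] = {&handle,                   // [0] raft::handle_t const&
                    src_vertices,              // [1] vertex_t*
                    dst_vertices,              // [2] vertex_t*
                    weights,                   // [3] weight_t*
                    vertex_partition_offsets,  // [4] vertex_t*
                    &num_partition_edges,      // [5] edge_t
                    &num_global_vertices,      // [6] vertex_t
                    &num_global_edges,         // [7] edge_t
                    &sorted_by_degree};        // [8] bool
  cugraph::visitors::erased_pack_t ep{p_args, 9};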
+template <typename vertex_t, typename edge_t, typename weight_t, bool st>
+struct graph_factory_t<
+  graph_t<vertex_t, edge_t, weight_t, st, true>>
+  : graph_factory_base_t {
+  std::unique_ptr<graph_envelope_t::base_graph_t> make_graph(erased_pack_t& ep) const override
+  {
+    /// std::cout << "Multi-GPU factory.\n";
+    std::vector<void*> const& v_args{ep.get_args()};
+
+    // invoke constructor using the cython arg pack:
+    //
+    assert(v_args.size() == 9);
+
+#ifdef _DEBUG_
+    std::cout << "Enter graph factory...\n";
+#endif
+
+    // constructor args unpacking:
+    //
+    raft::handle_t const& handle = *static_cast<raft::handle_t const*>(v_args[0]);
+
+    vertex_t* src_vertices             = static_cast<vertex_t*>(v_args[1]);
+    vertex_t* dst_vertices             = static_cast<vertex_t*>(v_args[2]);
+    weight_t* weights                  = static_cast<weight_t*>(v_args[3]);
+    vertex_t* vertex_partition_offsets = static_cast<vertex_t*>(v_args[4]);
+    edge_t num_partition_edges         = *static_cast<edge_t*>(v_args[5]);
+    vertex_t num_global_vertices       = *static_cast<vertex_t*>(v_args[6]);
+    edge_t num_global_edges            = *static_cast<edge_t*>(v_args[7]);
+    bool sorted_by_degree              = *static_cast<bool*>(v_args[8]);
+
+    // TODO: un-hardcode: have it passed in `ep`
+    //
+    graph_properties_t graph_props{.is_symmetric = false, .is_multigraph = false};
+    bool do_expensive_check{false};  // FIXME: check what this should default to
+
+    auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
+    auto const row_comm_rank = row_comm.get_rank();
+    auto const row_comm_size = row_comm.get_size();  // pcols
+    auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+    auto const col_comm_rank = col_comm.get_rank();
+    auto const col_comm_size = col_comm.get_size();  // prows
+
+    std::vector<edgelist_t<vertex_t, edge_t, weight_t>> edgelist(
+      {{src_vertices, dst_vertices, weights, num_partition_edges}});
+
+    std::vector<vertex_t> partition_offsets_vector(
+      vertex_partition_offsets, vertex_partition_offsets + (row_comm_size * col_comm_size) + 1);
+
+    partition_t<vertex_t> partition(
+      partition_offsets_vector, row_comm_size, col_comm_size, row_comm_rank, col_comm_rank);
+
+    std::optional<std::vector<vertex_t>>
+      opt_seg_off{};  // FIXME: may need to pass/extract the segment_offsets vector
+
+    return std::make_unique<graph_t<vertex_t, edge_t, weight_t, st, true>>(
+      handle,
+      edgelist,
+      partition,
+      num_global_vertices,
+      num_global_edges,
+      graph_props,
+      opt_seg_off,
+      do_expensive_check);
+  }
+};
+
+template <typename vertex_t, typename edge_t, typename weight_t, bool st>
+struct graph_factory_t<
+  graph_t<vertex_t, edge_t, weight_t, st, false>>
+  : graph_factory_base_t {
+  std::unique_ptr<graph_envelope_t::base_graph_t> make_graph(erased_pack_t& ep) const override
+  {
+    /// std::cout << "Single-GPU factory.\n";
+    std::vector<void*> const& v_args{ep.get_args()};
+
+    assert(v_args.size() == 6);
+
+    raft::handle_t const& handle = *static_cast<raft::handle_t const*>(v_args[0]);
+
+    auto const& elist = *static_cast<edgelist_t<vertex_t, edge_t, weight_t> const*>(v_args[1]);
+
+    auto nv = *static_cast<vertex_t*>(v_args[2]);
+
+    auto props = *static_cast<graph_properties_t*>(v_args[3]);
+
+    bool sorted = *static_cast<bool*>(v_args[4]);  // FIXME: no need to pass this!
+
+    bool check = *static_cast<bool*>(v_args[5]);
+
+    std::optional<std::vector<vertex_t>> opt_seg_off{};  // should not be needed for (!multi_gpu)
+
+    return std::make_unique<graph_t<vertex_t, edge_t, weight_t, st, false>>(
+      handle, elist, nv, props, opt_seg_off, check);
+  }
+};
+
+} // namespace visitors
+} // namespace cugraph
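The single-GPU factory above expects six entries instead. A hedged sketch under the same caveats (all locals hypothetical; the edgelist_t aggregate initialization mirrors the one used in the multi-GPU path above):

  cugraph::experimental::edgelist_t<int32_t, int32_t, float> elist{srcs, dsts, weights, num_edges};
  int32_t nv{num_vertices};
  cugraph::experimental::graph_properties_t props{.is_symmetric = false, .is_multigraph = false};
  bool sorted{false};
  bool check{false};
  void* p_args[] = {&handle, &elist, &nv, &props, &sorted, &check};
  cugraph::visitors::erased_pack_t ep{p_args, 6};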
diff --git a/cpp/include/cugraph/visitors/ret_terased.hpp b/cpp/include/cugraph/visitors/ret_terased.hpp
new file mode 100644
index 00000000000..19a7920e81b
--- /dev/null
+++ b/cpp/include/cugraph/visitors/ret_terased.hpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Andrei Schaffer, aschaffer@nvidia.com
+//
+#pragma once
+
+#include <memory>
+#include <stdexcept>
+
+namespace cugraph {
+namespace visitors {
+
+struct return_t {
+  struct base_return_t {
+    virtual ~base_return_t(void) {}
+
+    virtual void copy(return_t const&)                        = 0;
+    virtual std::unique_ptr<base_return_t> clone(void) const  = 0;
+  };
+
+  template <typename T>
+  struct generic_return_t : base_return_t {
+    generic_return_t(T const& t) : return_(t) {}
+
+    void copy(return_t const& r) override
+    {
+      base_return_t const* p_B = static_cast<base_return_t const*>(r.p_impl_.get());
+      generic_return_t<T> const* p_typed = dynamic_cast<generic_return_t<T> const*>(p_B);
+      return_ = p_typed->return_;
+    }
+
+    std::unique_ptr<base_return_t> clone(void) const override
+    {
+      return std::make_unique<generic_return_t<T>>(return_);
+    }
+
+    T const& get(void) const { return return_; }
+
+   private:
+    T return_;
+  };
+
+  return_t(void) = default;
+
+  template <typename T>
+  return_t(T const& t) : p_impl_(std::make_unique<generic_return_t<T>>(t))
+  {
+  }
+
+  return_t(return_t const& r) : p_impl_{r.clone()} {}
+
+  return_t& operator=(return_t const& r)
+  {
+    p_impl_ = r.clone();
+    return *this;
+  }
+
+  return_t(return_t&& other) : p_impl_(std::move(other.p_impl_)) {}
+  return_t& operator=(return_t&& other)
+  {
+    p_impl_ = std::move(other.p_impl_);
+    return *this;
+  }
+
+  std::unique_ptr<base_return_t> clone(void) const
+  {
+    if (p_impl_)
+      return p_impl_->clone();
+    else
+      return nullptr;
+  }
+
+  template <typename T>
+  T get(void) const
+  {
+    if (p_impl_) {
+      generic_return_t<T> const* p = static_cast<generic_return_t<T> const*>(p_impl_.get());
+      return p->get();
+    } else
+      throw std::runtime_error("ERROR: nullptr impl.");
+  }
+
+ private:
+  std::unique_ptr<base_return_t> p_impl_;
+};
+
+} // namespace visitors
+} // namespace cugraph
diff --git a/cpp/include/experimental/detail/graph_utils.cuh b/cpp/include/experimental/detail/graph_utils.cuh
deleted file mode 100644
index bf56b2e6f80..00000000000
--- a/cpp/include/experimental/detail/graph_utils.cuh
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ -#pragma once - -#include -#include - -#include -#include -#include - -#include -#include - -#include -#include - -namespace cugraph { -namespace experimental { -namespace detail { - -// compute the numbers of nonzeros in rows (of the graph adjacency matrix, if store_transposed = -// false) or columns (of the graph adjacency matrix, if store_transposed = true) -template -rmm::device_uvector compute_major_degree( - raft::handle_t const &handle, - std::vector const &adj_matrix_partition_offsets, - partition_t const &partition) -{ - auto &row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto &col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - rmm::device_uvector local_degrees(0, handle.get_stream()); - rmm::device_uvector degrees(0, handle.get_stream()); - - vertex_t max_num_local_degrees{0}; - for (int i = 0; i < (partition.is_hypergraph_partitioned() ? col_comm_size : row_comm_size); - ++i) { - auto vertex_partition_idx = partition.is_hypergraph_partitioned() - ? static_cast(i * row_comm_size + row_comm_rank) - : static_cast(col_comm_rank * row_comm_size + i); - auto vertex_partition_size = partition.get_vertex_partition_size(vertex_partition_idx); - max_num_local_degrees = std::max(max_num_local_degrees, vertex_partition_size); - if (i == (partition.is_hypergraph_partitioned() ? col_comm_rank : row_comm_rank)) { - degrees.resize(vertex_partition_size, handle.get_stream()); - } - } - local_degrees.resize(max_num_local_degrees, handle.get_stream()); - for (int i = 0; i < (partition.is_hypergraph_partitioned() ? col_comm_size : row_comm_size); - ++i) { - auto vertex_partition_idx = partition.is_hypergraph_partitioned() - ? static_cast(i * row_comm_size + row_comm_rank) - : static_cast(col_comm_rank * row_comm_size + i); - vertex_t major_first{}; - vertex_t major_last{}; - std::tie(major_first, major_last) = partition.get_vertex_partition_range(vertex_partition_idx); - auto p_offsets = - partition.is_hypergraph_partitioned() - ? adj_matrix_partition_offsets[i] - : adj_matrix_partition_offsets[0] + - (major_first - partition.get_vertex_partition_first(col_comm_rank * row_comm_size)); - thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(major_last - major_first), - local_degrees.data(), - [p_offsets] __device__(auto i) { return p_offsets[i + 1] - p_offsets[i]; }); - if (partition.is_hypergraph_partitioned()) { - col_comm.reduce(local_degrees.data(), - i == col_comm_rank ? degrees.data() : static_cast(nullptr), - static_cast(major_last - major_first), - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } else { - row_comm.reduce(local_degrees.data(), - i == row_comm_rank ? degrees.data() : static_cast(nullptr), - static_cast(major_last - major_first), - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } - } - - raft::comms::status_t status{}; - if (partition.is_hypergraph_partitioned()) { - status = - col_comm.sync_stream(handle.get_stream()); // this is neessary as local_degrees will become - // out-of-scope once this function returns. - } else { - status = - row_comm.sync_stream(handle.get_stream()); // this is neessary as local_degrees will become - // out-of-scope once this function returns. 
- } - CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); - - return degrees; -} - -// compute the numbers of nonzeros in rows (of the graph adjacency matrix, if store_transposed = -// false) or columns (of the graph adjacency matrix, if store_transposed = true) -template -rmm::device_uvector compute_major_degree( - raft::handle_t const &handle, - std::vector> const &adj_matrix_partition_offsets, - partition_t const &partition) -{ - // we can avoid creating this temporary with "if constexpr" supported from C++17 - std::vector tmp_offsets(adj_matrix_partition_offsets.size(), nullptr); - std::transform(adj_matrix_partition_offsets.begin(), - adj_matrix_partition_offsets.end(), - tmp_offsets.begin(), - [](auto const &offsets) { return offsets.data(); }); - return compute_major_degree(handle, tmp_offsets, partition); -} - -template -struct degree_from_offsets_t { - edge_t const *offsets{nullptr}; - - __device__ edge_t operator()(vertex_t v) { return offsets[v + 1] - offsets[v]; } -}; - -} // namespace detail -} // namespace experimental -} // namespace cugraph diff --git a/cpp/include/matrix_partition_device.cuh b/cpp/include/matrix_partition_device.cuh deleted file mode 100644 index 53796530f60..00000000000 --- a/cpp/include/matrix_partition_device.cuh +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include - -#include - -namespace cugraph { -namespace experimental { - -template -class matrix_partition_device_base_t { - public: - matrix_partition_device_base_t(edge_t const* offsets, - vertex_t const* indices, - weight_t const* weights) - : offsets_(offsets), indices_(indices), weights_(weights) - { - } - - __device__ thrust::tuple get_local_edges( - vertex_t major_offset) const noexcept - { - auto edge_offset = *(offsets_ + major_offset); - auto local_degree = *(offsets_ + (major_offset + 1)) - edge_offset; - auto indices = indices_ + edge_offset; - auto weights = weights_ != nullptr ? weights_ + edge_offset : nullptr; - return thrust::make_tuple(indices, weights, local_degree); - } - - __device__ edge_t get_local_degree(vertex_t major_offset) const noexcept - { - return *(offsets_ + (major_offset + 1)) - *(offsets_ + major_offset); - } - - private: - // should be trivially copyable to device - edge_t const* offsets_{nullptr}; - vertex_t const* indices_{nullptr}; - weight_t const* weights_{nullptr}; -}; - -template -class matrix_partition_device_t; - -// multi-GPU version -template -class matrix_partition_device_t> - : public matrix_partition_device_base_t { - public: - matrix_partition_device_t(GraphViewType const& graph_view, size_t partition_idx) - : matrix_partition_device_base_t( - graph_view.offsets(partition_idx), - graph_view.indices(partition_idx), - graph_view.weights(partition_idx)), - major_first_(GraphViewType::is_adj_matrix_transposed - ? 
graph_view.get_local_adj_matrix_partition_col_first(partition_idx) - : graph_view.get_local_adj_matrix_partition_row_first(partition_idx)), - major_last_(GraphViewType::is_adj_matrix_transposed - ? graph_view.get_local_adj_matrix_partition_col_last(partition_idx) - : graph_view.get_local_adj_matrix_partition_row_last(partition_idx)), - minor_first_(GraphViewType::is_adj_matrix_transposed - ? graph_view.get_local_adj_matrix_partition_row_first(partition_idx) - : graph_view.get_local_adj_matrix_partition_col_first(partition_idx)), - minor_last_(GraphViewType::is_adj_matrix_transposed - ? graph_view.get_local_adj_matrix_partition_row_last(partition_idx) - : graph_view.get_local_adj_matrix_partition_col_last(partition_idx)), - major_value_start_offset_( - GraphViewType::is_adj_matrix_transposed - ? graph_view.get_local_adj_matrix_partition_col_value_start_offset(partition_idx) - : graph_view.get_local_adj_matrix_partition_row_value_start_offset(partition_idx)) - { - } - - typename GraphViewType::vertex_type get_major_value_start_offset() const - { - return major_value_start_offset_; - } - - __host__ __device__ typename GraphViewType::vertex_type get_major_first() const noexcept - { - return major_first_; - } - - __host__ __device__ typename GraphViewType::vertex_type get_major_last() const noexcept - { - return major_last_; - } - - __host__ __device__ typename GraphViewType::vertex_type get_major_size() const noexcept - { - return major_last_ - major_first_; - } - - __host__ __device__ typename GraphViewType::vertex_type get_minor_first() const noexcept - { - return minor_first_; - } - - __host__ __device__ typename GraphViewType::vertex_type get_minor_last() const noexcept - { - return minor_last_; - } - - __host__ __device__ typename GraphViewType::vertex_type get_minor_size() const noexcept - { - return minor_last_ - minor_first_; - } - - __host__ __device__ typename GraphViewType::vertex_type get_major_offset_from_major_nocheck( - typename GraphViewType::vertex_type major) const noexcept - { - return major - major_first_; - } - - __host__ __device__ typename GraphViewType::vertex_type get_minor_offset_from_minor_nocheck( - typename GraphViewType::vertex_type minor) const noexcept - { - return minor - minor_first_; - } - - __host__ __device__ typename GraphViewType::vertex_type get_major_from_major_offset_nocheck( - typename GraphViewType::vertex_type major_offset) const noexcept - { - return major_first_ + major_offset; - } - - __host__ __device__ typename GraphViewType::vertex_type get_minor_from_minor_offset_nocheck( - typename GraphViewType::vertex_type minor_offset) const noexcept - { - return minor_first_ + minor_offset; - } - - private: - // should be trivially copyable to device - typename GraphViewType::vertex_type major_first_{0}; - typename GraphViewType::vertex_type major_last_{0}; - typename GraphViewType::vertex_type minor_first_{0}; - typename GraphViewType::vertex_type minor_last_{0}; - - typename GraphViewType::vertex_type major_value_start_offset_{0}; -}; - -// single-GPU version -template -class matrix_partition_device_t> - : public matrix_partition_device_base_t { - public: - matrix_partition_device_t(GraphViewType const& graph_view, size_t partition_idx) - : matrix_partition_device_base_t( - graph_view.offsets(), graph_view.indices(), graph_view.weights()), - number_of_vertices_(graph_view.get_number_of_vertices()) - { - assert(partition_idx == 0); - } - - typename GraphViewType::vertex_type get_major_value_start_offset() const - { - return typename 
GraphViewType::vertex_type{0}; - } - - __host__ __device__ constexpr typename GraphViewType::vertex_type get_major_first() const noexcept - { - return typename GraphViewType::vertex_type{0}; - } - - __host__ __device__ typename GraphViewType::vertex_type get_major_last() const noexcept - { - return number_of_vertices_; - } - - __host__ __device__ typename GraphViewType::vertex_type get_major_size() const noexcept - { - return number_of_vertices_; - } - - __host__ __device__ constexpr typename GraphViewType::vertex_type get_minor_first() const noexcept - { - return typename GraphViewType::vertex_type{0}; - } - - __host__ __device__ typename GraphViewType::vertex_type get_minor_last() const noexcept - { - return number_of_vertices_; - } - - __host__ __device__ typename GraphViewType::vertex_type get_minor_size() const noexcept - { - return number_of_vertices_; - } - - __host__ __device__ typename GraphViewType::vertex_type get_major_offset_from_major_nocheck( - typename GraphViewType::vertex_type major) const noexcept - { - return major; - } - - __host__ __device__ typename GraphViewType::vertex_type get_minor_offset_from_minor_nocheck( - typename GraphViewType::vertex_type minor) const noexcept - { - return minor; - } - - __host__ __device__ typename GraphViewType::vertex_type get_major_from_major_offset_nocheck( - typename GraphViewType::vertex_type major_offset) const noexcept - { - return major_offset; - } - - __host__ __device__ typename GraphViewType::vertex_type get_minor_from_minor_offset_nocheck( - typename GraphViewType::vertex_type minor_offset) const noexcept - { - return minor_offset; - } - - private: - typename GraphViewType::vertex_type number_of_vertices_; -}; - -} // namespace experimental -} // namespace cugraph diff --git a/cpp/include/patterns/any_of_adj_matrix_row.cuh b/cpp/include/patterns/any_of_adj_matrix_row.cuh deleted file mode 100644 index 199e7c230ef..00000000000 --- a/cpp/include/patterns/any_of_adj_matrix_row.cuh +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include - -#include -#include - -#include -#include - -namespace cugraph { -namespace experimental { - -/** - * @brief Check any of graph adjacency matrix row properties satisfy the given predicate. - * - * Returns true if @p row_op returns true for at least once (in any process in multi-GPU), returns - * false otherwise. This function is inspired by thrust::any_of(). - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam RowOp Type of the unary predicate operator. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. 
- * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row properties - * for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param row_op Unary predicate operator that takes *(@p adj_matrix_row_value_input_first + i) - * (where i = [0, @p graph_view.get_number_of_local_adj_matrix_partition_rows()) and returns either - * true or false. - * @return true If the predicate returns true at least once (in any process in multi-GPU). - * @return false If the predicate never returns true (in any process in multi-GPU). - */ -template -bool any_of_adj_matrix_row(raft::handle_t const& handle, - GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - RowOp row_op) -{ - // better use thrust::any_of once https://github.com/thrust/thrust/issues/1016 is resolved - auto count = thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - adj_matrix_row_value_input_first, - adj_matrix_row_value_input_first + graph_view.get_number_of_local_adj_matrix_partition_rows(), - row_op); - if (GraphViewType::is_multi_gpu) { - count = host_scalar_allreduce(handle.get_comms(), count, handle.get_stream()); - } - return (count > 0); -} - -} // namespace experimental -} // namespace cugraph diff --git a/cpp/include/patterns/count_if_e.cuh b/cpp/include/patterns/count_if_e.cuh deleted file mode 100644 index 4f0f0a7a43e..00000000000 --- a/cpp/include/patterns/count_if_e.cuh +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include -#include - -namespace cugraph { -namespace experimental { - -namespace detail { - -// FIXME: block size requires tuning -int32_t constexpr count_if_e_for_all_block_size = 128; - -// FIXME: function names conflict if included with transform_reduce_e.cuh -template -__global__ void for_all_major_for_all_nbr_low_degree( - matrix_partition_device_t matrix_partition, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - typename GraphViewType::edge_type* block_counts, - EdgeOp e_op) -{ - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using weight_t = typename GraphViewType::weight_type; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto idx = static_cast(tid); - - edge_t count{0}; - while (idx < static_cast(matrix_partition.get_major_size())) { - vertex_t const* indices{nullptr}; - weight_t const* weights{nullptr}; - edge_t local_degree{}; - thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(idx); -#if 1 - count += thrust::count_if( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - [&matrix_partition, - &adj_matrix_row_value_input_first, - &adj_matrix_col_value_input_first, - &e_op, - idx, - indices, - weights] __device__(auto i) { - auto minor = indices[i]; - auto weight = weights != nullptr ? weights[i] : 1.0; - auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); - auto row = GraphViewType::is_adj_matrix_transposed - ? minor - : matrix_partition.get_major_from_major_offset_nocheck(idx); - auto col = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_from_major_offset_nocheck(idx) - : minor; - auto row_offset = - GraphViewType::is_adj_matrix_transposed ? minor_offset : static_cast(idx); - auto col_offset = - GraphViewType::is_adj_matrix_transposed ? static_cast(idx) : minor_offset; - auto e_op_result = evaluate_edge_op() - .compute(row, - col, - weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), - e_op); - - return e_op_result; - }); -#else - // FIXME: delete this once we verify that the code above is not slower than this. - for (vertex_t i = 0; i < local_degree; ++i) { - auto minor = indices[i]; - auto weight = weights != nullptr ? weights[i] : 1.0; - auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); - auto row = GraphViewType::is_adj_matrix_transposed - ? minor - : matrix_partition.get_major_from_major_offset_nocheck(idx); - auto col = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_from_major_offset_nocheck(idx) - : minor; - auto row_offset = - GraphViewType::is_adj_matrix_transposed ? minor_offset : static_cast(idx); - auto col_offset = - GraphViewType::is_adj_matrix_transposed ? 
static_cast(idx) : minor_offset; - auto e_op_result = evaluate_edge_op() - .compute(row, - col, - weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), - e_op); - if (e_op_result) { count++; } - } -#endif - idx += gridDim.x * blockDim.x; - } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - count = BlockReduce(temp_storage).Sum(count); - if (threadIdx.x == 0) { *(block_counts + blockIdx.x) = count; } -} - -} // namespace detail - -/** - * @brief Count the number of edges that satisfies the given predicate. - * - * This function is inspired by thrust::count_if(). - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. - * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). - * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge - * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + - * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns true if this edge should be - * included in the returned count. - * @return GraphViewType::edge_type Number of times @p e_op returned true. - */ -template -typename GraphViewType::edge_type count_if_e( - raft::handle_t const& handle, - GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - EdgeOp e_op) -{ - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - - edge_t count{0}; - for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - matrix_partition_device_t matrix_partition(graph_view, i); - - if (matrix_partition.get_major_size() > 0) { - auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? vertex_t{0} - : matrix_partition.get_major_value_start_offset(); - auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? 
matrix_partition.get_major_value_start_offset() - : vertex_t{0}; - - raft::grid_1d_thread_t update_grid(matrix_partition.get_major_size(), - detail::count_if_e_for_all_block_size, - handle.get_device_properties().maxGridSize[0]); - - rmm::device_vector block_counts(update_grid.num_blocks); - - detail::for_all_major_for_all_nbr_low_degree<<>>( - matrix_partition, - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, - block_counts.data().get(), - e_op); - - // FIXME: we have several options to implement this. With cooperative group support - // (https://devblogs.nvidia.com/cooperative-groups/), we can run this synchronization within - // the previous kernel. Using atomics at the end of the previous kernel is another option - // (sequentialization due to atomics may not be bad as different blocks may reach the - // synchronization point in varying timings and the number of SMs is not very big) - count += thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - block_counts.begin(), - block_counts.end(), - edge_t{0}, - thrust::plus()); - } - } - - if (GraphViewType::is_multi_gpu) { - count = host_scalar_allreduce(handle.get_comms(), count, handle.get_stream()); - } - - return count; -} - -} // namespace experimental -} // namespace cugraph diff --git a/cpp/include/patterns/transform_reduce_e.cuh b/cpp/include/patterns/transform_reduce_e.cuh deleted file mode 100644 index 797facd4657..00000000000 --- a/cpp/include/patterns/transform_reduce_e.cuh +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include -#include -#include -#include - -#include -#include -#include - -#include - -#include -#include - -namespace cugraph { -namespace experimental { - -namespace detail { - -// FIXME: block size requires tuning -int32_t constexpr transform_reduce_e_for_all_block_size = 128; - -template -__global__ void for_all_major_for_all_nbr_low_degree( - matrix_partition_device_t matrix_partition, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - BlockResultIterator block_result_first, - EdgeOp e_op) -{ - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using weight_t = typename GraphViewType::weight_type; - using e_op_result_t = typename std::iterator_traits::value_type; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - size_t idx = static_cast(tid); - - e_op_result_t e_op_result_sum{}; - while (idx < static_cast(matrix_partition.get_major_size())) { - vertex_t const* indices{nullptr}; - weight_t const* weights{nullptr}; - edge_t local_degree{}; - thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(idx); -#if 1 - auto sum = thrust::transform_reduce( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - [&matrix_partition, - &adj_matrix_row_value_input_first, - &adj_matrix_col_value_input_first, - &e_op, - idx, - indices, - weights] __device__(auto i) { - auto minor = indices[i]; - auto weight = weights != nullptr ? weights[i] : weight_t{1.0}; - auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); - auto row = GraphViewType::is_adj_matrix_transposed - ? minor - : matrix_partition.get_major_from_major_offset_nocheck(idx); - auto col = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_from_major_offset_nocheck(idx) - : minor; - auto row_offset = - GraphViewType::is_adj_matrix_transposed ? minor_offset : static_cast(idx); - auto col_offset = - GraphViewType::is_adj_matrix_transposed ? static_cast(idx) : minor_offset; - return evaluate_edge_op() - .compute(row, - col, - weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), - e_op); - }, - e_op_result_t{}, - [] __device__(auto lhs, auto rhs) { return plus_edge_op_result(lhs, rhs); }); - - e_op_result_sum = plus_edge_op_result(e_op_result_sum, sum); -#else - // FIXME: delete this once we verify that the code above is not slower than this. - for (vertex_t i = 0; i < local_degree; ++i) { - auto minor = indices[i]; - auto weight = weights != nullptr ? weights[i] : weight_t{1.0}; - auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); - auto row = GraphViewType::is_adj_matrix_transposed - ? minor - : matrix_partition.get_major_from_major_offset_nocheck(idx); - auto col = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_from_major_offset_nocheck(idx) - : minor; - auto row_offset = - GraphViewType::is_adj_matrix_transposed ? minor_offset : static_cast(idx); - auto col_offset = - GraphViewType::is_adj_matrix_transposed ? 
static_cast(idx) : minor_offset; - auto e_op_result = evaluate_edge_op() - .compute(row, - col, - weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), - e_op); - e_op_result_sum = plus_edge_op_result(e_op_result_sum, e_op_result); - } -#endif - idx += gridDim.x * blockDim.x; - } - - e_op_result_sum = - block_reduce_edge_op_result().compute( - e_op_result_sum); - if (threadIdx.x == 0) { *(block_result_first + blockIdx.x) = e_op_result_sum; } -} - -} // namespace detail - -/** - * @brief Iterate over the entire set of edges and reduce @p edge_op outputs. - * - * This function is inspired by thrust::transform_reduce(). - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. - * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. - * @tparam T Type of the initial value. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). - * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge - * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + - * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns a transformed value to be reduced. - * @param init Initial value to be added to the transform-reduced input vertex properties. - * @return T Reduction of the @p edge_op outputs. - */ -template -T transform_reduce_e(raft::handle_t const& handle, - GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - EdgeOp e_op, - T init) -{ - static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - - using vertex_t = typename GraphViewType::vertex_type; - - T result{}; - for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - matrix_partition_device_t matrix_partition(graph_view, i); - - if (matrix_partition.get_major_size() > 0) { - auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? vertex_t{0} - : matrix_partition.get_major_value_start_offset(); - auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? 
matrix_partition.get_major_value_start_offset() - : vertex_t{0}; - - raft::grid_1d_thread_t update_grid(matrix_partition.get_major_size(), - detail::transform_reduce_e_for_all_block_size, - handle.get_device_properties().maxGridSize[0]); - - rmm::device_vector block_results(update_grid.num_blocks); - - detail::for_all_major_for_all_nbr_low_degree<<>>( - matrix_partition, - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, - block_results.data(), - e_op); - - // FIXME: we have several options to implement this. With cooperative group support - // (https://devblogs.nvidia.com/cooperative-groups/), we can run this synchronization within - // the previous kernel. Using atomics at the end of the previous kernel is another option - // (sequentialization due to atomics may not be bad as different blocks may reach the - // synchronization point in varying timings and the number of SMs is not very big) - auto partial_result = - thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - block_results.begin(), - block_results.end(), - T(), - [] __device__(auto lhs, auto rhs) { return plus_edge_op_result(lhs, rhs); }); - - result = plus_edge_op_result(result, partial_result); - } - } - - if (GraphViewType::is_multi_gpu) { - result = host_scalar_allreduce(handle.get_comms(), result, handle.get_stream()); - } - - return plus_edge_op_result(init, result); -} - -} // namespace experimental -} // namespace cugraph diff --git a/cpp/include/patterns/transform_reduce_v_with_adj_matrix_row.cuh b/cpp/include/patterns/transform_reduce_v_with_adj_matrix_row.cuh deleted file mode 100644 index f5af03d647c..00000000000 --- a/cpp/include/patterns/transform_reduce_v_with_adj_matrix_row.cuh +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include - -#include - -#include -#include -#include - -namespace cugraph { -namespace experimental { - -/** - * @brief Apply an operator to the matching vertex and adjacency matrix row properties and reduce. - * - * i'th vertex matches with the i'th row in the graph adjacency matrix. @p v_op takes vertex - * properties and adjacency matrix row properties for the matching row, and @p v_op outputs are - * reduced. This function is inspired by thrust::transform_reduce(). - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexValueInputIterator Type of the iterator for vertex properties. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. - * @tparam VertexOp Type of the binary vertex operator. - * @tparam T Type of the initial value. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. 
- * @param vertex_value_input_first Iterator pointing to the vertex properties for the first - * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) - * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param v_op Binary operator takes *(@p vertex_value_input_first + i) and *(@p - * adj_matrix_row_value_input_first + j) (where i and j are set for a vertex and the matching row) - * and returns a transformed value to be reduced. - * @param init Initial value to be added to the transform-reduced input vertex properties. - * @return T Reduction of the @p v_op outputs. - */ -template -T transform_reduce_v_with_adj_matrix_row( - raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexValueInputIterator vertex_value_input_first, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - VertexOp v_op, - T init) -{ - T ret{}; - - auto vertex_first = graph_view.get_local_vertex_first(); - auto vertex_last = graph_view.get_local_vertex_last(); - for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - auto row_first = graph_view.get_local_adj_matrix_partition_row_first(i); - auto row_last = graph_view.get_local_adj_matrix_partition_row_last(i); - - auto range_first = std::max(vertex_first, row_first); - auto range_last = std::min(vertex_last, row_last); - - if (range_last > range_first) { - matrix_partition_device_t matrix_partition(graph_view, i); - auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? 0 - : matrix_partition.get_major_value_start_offset(); - - auto input_first = thrust::make_zip_iterator(thrust::make_tuple( - vertex_value_input_first + (range_first - vertex_first), - adj_matrix_row_value_input_first + row_value_input_offset + (range_first - row_first))); - auto v_op_wrapper = [v_op] __device__(auto v_and_row_val) { - return v_op(thrust::get<0>(v_and_row_val), thrust::get<1>(v_and_row_val)); - }; - ret += - thrust::transform_reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - input_first, - input_first + (range_last - range_first), - v_op_wrapper, - T{}, - thrust::plus()); - } - } - - if (GraphViewType::is_multi_gpu) { - ret = host_scalar_allreduce(handle.get_comms(), ret, handle.get_stream()); - } - - return init + ret; -} - -} // namespace experimental -} // namespace cugraph diff --git a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh deleted file mode 100644 index a2250482c68..00000000000 --- a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh +++ /dev/null @@ -1,706 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace cugraph { -namespace experimental { - -namespace detail { - -// FIXME: block size requires tuning -int32_t constexpr update_frontier_v_push_if_out_nbr_for_all_block_size = 128; -int32_t constexpr update_frontier_v_push_if_out_nbr_update_block_size = 128; - -template -__global__ void for_all_frontier_row_for_all_nbr_low_degree( - matrix_partition_device_t matrix_partition, - RowIterator row_first, - RowIterator row_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - BufferKeyOutputIterator buffer_key_output_first, - BufferPayloadOutputIterator buffer_payload_output_first, - size_t* buffer_idx_ptr, - EdgeOp e_op) -{ - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using weight_t = typename GraphViewType::weight_type; - - static_assert(!GraphViewType::is_adj_matrix_transposed, - "GraphViewType should support the push model."); - - auto num_rows = static_cast(thrust::distance(row_first, row_last)); - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - size_t idx = tid; - - while (idx < num_rows) { - vertex_t row = *(row_first + idx); - auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); - vertex_t const* indices{nullptr}; - weight_t const* weights{nullptr}; - edge_t local_out_degree{}; - thrust::tie(indices, weights, local_out_degree) = matrix_partition.get_local_edges(row_offset); - for (vertex_t i = 0; i < local_out_degree; ++i) { - auto col = indices[i]; - auto weight = weights != nullptr ? weights[i] : 1.0; - auto col_offset = matrix_partition.get_minor_offset_from_minor_nocheck(col); - auto e_op_result = evaluate_edge_op() - .compute(row, - col, - weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), - e_op); - if (thrust::get<0>(e_op_result) == true) { - // FIXME: This atomicAdd serializes execution. If we renumber vertices to insure that rows - // within a partition are sorted by their out-degree in decreasing order, we can compute - // a tight uppper bound for the maximum number of pushes per warp/block and use shared - // memory buffer to reduce the number of atomicAdd operations. 
- static_assert(sizeof(unsigned long long int) == sizeof(size_t)); - auto buffer_idx = atomicAdd(reinterpret_cast(buffer_idx_ptr), - static_cast(1)); - *(buffer_key_output_first + buffer_idx) = col; - *(buffer_payload_output_first + buffer_idx) = - remove_first_thrust_tuple_element()(e_op_result); - } - } - - idx += gridDim.x * blockDim.x; - } -} - -template -size_t reduce_buffer_elements(raft::handle_t const& handle, - BufferKeyOutputIterator buffer_key_output_first, - BufferPayloadOutputIterator buffer_payload_output_first, - size_t num_buffer_elements, - ReduceOp reduce_op) -{ - thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - buffer_key_output_first, - buffer_key_output_first + num_buffer_elements, - buffer_payload_output_first); - - if (std::is_same>::value) { - // FIXME: if ReducOp is any, we may have a cheaper alternative than sort & uique (i.e. discard - // non-first elements) - auto it = thrust::unique_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - buffer_key_output_first, - buffer_key_output_first + num_buffer_elements, - buffer_payload_output_first); - return static_cast(thrust::distance(buffer_key_output_first, thrust::get<0>(it))); - } else { - using key_t = typename std::iterator_traits::value_type; - using payload_t = typename std::iterator_traits::value_type; - // FIXME: better avoid temporary buffer or at least limit the maximum buffer size (if we adopt - // CUDA cooperative group https://devblogs.nvidia.com/cooperative-groups and global sync(), we - // can use aggregate shared memory as a temporary buffer, or we can limit the buffer size, and - // split one thrust::reduce_by_key call to multiple thrust::reduce_by_key calls if the - // temporary buffer size exceeds the maximum buffer size (may be definied as percentage of the - // system HBM size or a function of the maximum number of threads in the system)) - // FIXME: actually, we can find how many unique keys are here by now. - // FIXME: if GraphViewType::is_multi_gpu is true, this should be executed on the GPU holding the - // vertex unless reduce_op is a pure function. 
- rmm::device_vector keys(num_buffer_elements); - rmm::device_vector values(num_buffer_elements); - auto it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - buffer_key_output_first, - buffer_key_output_first + num_buffer_elements, - buffer_payload_output_first, - keys.begin(), - values.begin(), - thrust::equal_to(), - reduce_op); - auto num_reduced_buffer_elements = - static_cast(thrust::distance(keys.begin(), thrust::get<0>(it))); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - keys.begin(), - keys.begin() + num_reduced_buffer_elements, - buffer_key_output_first); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - values.begin(), - values.begin() + num_reduced_buffer_elements, - buffer_payload_output_first); - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // this is necessary as kyes & values will become out-of-scope once - // this function returns - return num_reduced_buffer_elements; - } -} - -template -__global__ void update_frontier_and_vertex_output_values( - vertex_partition_device_t vertex_partition, - BufferKeyInputIterator buffer_key_input_first, - BufferPayloadInputIterator buffer_payload_input_first, - size_t num_buffer_elements, - VertexValueInputIterator vertex_value_input_first, - VertexValueOutputIterator vertex_value_output_first, - vertex_t** bucket_ptrs, - size_t* bucket_sizes_ptr, - size_t invalid_bucket_idx, - vertex_t invalid_vertex, - VertexOp v_op) -{ - static_assert(std::is_same::value_type, - vertex_t>::value); - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - size_t idx = tid; - size_t block_idx = blockIdx.x; - // FIXME: it might be more performant to process more than one element per thread - auto num_blocks = (num_buffer_elements + blockDim.x - 1) / blockDim.x; - - using BlockScan = - cub::BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; - - __shared__ size_t bucket_block_start_offsets[num_buckets]; - - size_t bucket_block_local_offsets[num_buckets]; - size_t bucket_block_aggregate_sizes[num_buckets]; - - while (block_idx < num_blocks) { - for (size_t i = 0; i < num_buckets; ++i) { bucket_block_local_offsets[i] = 0; } - - size_t selected_bucket_idx{invalid_bucket_idx}; - vertex_t key{invalid_vertex}; - - if (idx < num_buffer_elements) { - key = *(buffer_key_input_first + idx); - auto key_offset = vertex_partition.get_local_vertex_offset_from_vertex_nocheck(key); - auto v_val = *(vertex_value_input_first + key_offset); - auto payload = *(buffer_payload_input_first + idx); - auto v_op_result = v_op(v_val, payload); - selected_bucket_idx = thrust::get<0>(v_op_result); - if (selected_bucket_idx != invalid_bucket_idx) { - *(vertex_value_output_first + key_offset) = - remove_first_thrust_tuple_element()(v_op_result); - bucket_block_local_offsets[selected_bucket_idx] = 1; - } - } - - for (size_t i = 0; i < num_buckets; ++i) { - BlockScan(temp_storage) - .ExclusiveSum(bucket_block_local_offsets[i], - bucket_block_local_offsets[i], - bucket_block_aggregate_sizes[i]); - } - - if (threadIdx.x == 0) { - for (size_t i = 0; i < num_buckets; ++i) { - static_assert(sizeof(unsigned long long int) == sizeof(size_t)); - bucket_block_start_offsets[i] = - atomicAdd(reinterpret_cast(bucket_sizes_ptr + i), - static_cast(bucket_block_aggregate_sizes[i])); - } - } - - __syncthreads(); - - // FIXME: better use shared memory buffer to aggreaget global memory writes - if (selected_bucket_idx != invalid_bucket_idx) { - 
bucket_ptrs[selected_bucket_idx][bucket_block_start_offsets[selected_bucket_idx] + - bucket_block_local_offsets[selected_bucket_idx]] = key; - } - - idx += gridDim.x * blockDim.x; - block_idx += gridDim.x; - } -} - -} // namespace detail - -/** - * @brief Update vertex frontier and vertex property values iterating over the outgoing edges. - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexIterator Type of the iterator for vertex identifiers. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. - * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. - * @tparam ReduceOp Type of the binary reduction operator. - * @tparam VertexValueInputIterator Type of the iterator for vertex properties. - * @tparam VertexValueOutputIterator Type of the iterator for vertex property variables. - * @tparam VertexFrontierType Type of the vertex frontier class which abstracts vertex frontier - * managements. - * @tparam VertexOp Type of the binary vertex operator. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex in the current frontier. v - * in [vertex_first, vertex_last) should be distinct (and should belong to this process in - * multi-GPU), otherwise undefined behavior - * @param vertex_last Iterator pointing to the last (exclusive) vertex in the current frontier. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). - * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge - * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + - * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns a value to reduced by the @p - * reduce_op. - * @param reduce_op Binary operator takes two input arguments and reduce the two variables to one. - * @param vertex_value_input_first Iterator pointing to the vertex properties for the first - * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) - * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first - * (inclusive) vertex (assigned to tihs process in multi-GPU). 
`vertex_value_output_last` - * (exclusive) is deduced as @p vertex_value_output_first + @p - * graph_view.get_number_of_local_vertices(). - * @param vertex_frontier vertex frontier class object for vertex frontier managements. This object - * includes multiple bucket objects. - * @param v_op Binary operator takes *(@p vertex_value_input_first + i) (where i is [0, @p - * graph_view.get_number_of_local_vertices())) and reduced value of the @p e_op outputs for - * this vertex and returns the target bucket index (for frontier update) and new verrtex property - * values (to update *(@p vertex_value_output_first + i)). - */ -template -void update_frontier_v_push_if_out_nbr( - raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - EdgeOp e_op, - ReduceOp reduce_op, - VertexValueInputIterator vertex_value_input_first, - VertexValueOutputIterator vertex_value_output_first, - VertexFrontierType& vertex_frontier, - VertexOp v_op) -{ - static_assert(!GraphViewType::is_adj_matrix_transposed, - "GraphViewType should support the push model."); - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - - // 1. fill the buffer - - vertex_frontier.set_buffer_idx_value(0); - - auto loop_count = size_t{1}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? graph_view.get_number_of_local_adj_matrix_partitions() - : static_cast(row_comm_size); - } - - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i); - - rmm::device_uvector frontier_rows( - 0, handle.get_stream()); // relevant only if GraphViewType::is_multi_gpu is true - - size_t frontier_size{}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - - auto sub_comm_rank = graph_view.is_hypergraph_partitioned() ? col_comm_rank : row_comm_rank; - frontier_size = host_scalar_bcast( - graph_view.is_hypergraph_partitioned() ? col_comm : row_comm, - (static_cast(sub_comm_rank) == i) ? thrust::distance(vertex_first, vertex_last) - : size_t{0}, - i, - handle.get_stream()); - if (static_cast(sub_comm_rank) != i) { - frontier_rows.resize(frontier_size, handle.get_stream()); - } - device_bcast(graph_view.is_hypergraph_partitioned() ? col_comm : row_comm, - vertex_first, - frontier_rows.begin(), - frontier_size, - i, - handle.get_stream()); - } else { - frontier_size = thrust::distance(vertex_first, vertex_last); - } - - edge_t max_pushes = - frontier_size > 0 - ? frontier_rows.size() > 0 - ? 
thrust::transform_reduce( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - frontier_rows.begin(), - frontier_rows.end(), - [matrix_partition] __device__(auto row) { - auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); - return matrix_partition.get_local_degree(row_offset); - }, - edge_t{0}, - thrust::plus()) - : thrust::transform_reduce( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_first, - vertex_last, - [matrix_partition] __device__(auto row) { - auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); - return matrix_partition.get_local_degree(row_offset); - }, - edge_t{0}, - thrust::plus()) - : edge_t{0}; - - // FIXME: This is highly pessimistic for single GPU (and multi-GPU as well if we maintain - // additional per column data for filtering in e_op). If we can pause & resume execution if - // buffer needs to be increased (and if we reserve address space to avoid expensive - // reallocation; - // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management/), we can - // start with a smaller buffer size (especially when the frontier size is large). - // for special cases when we can assure that there is no more than one push per destination - // (e.g. if cugraph::experimental::reduce_op::any is used), we can limit the buffer size to - // std::min(max_pushes, matrix_partition.get_minor_size()). - // For Volta+, we can limit the buffer size to std::min(max_pushes, - // matrix_partition.get_minor_size()) if the reduction operation is a pure function if we use - // locking. - // FIXME: if i != 0, this will require costly reallocation if we don't use the new CUDA feature - // to reserve address space. - vertex_frontier.resize_buffer(vertex_frontier.get_buffer_idx_value() + max_pushes); - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first); - auto buffer_payload_first = std::get<1>(buffer_first); - - auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? vertex_t{0} - : matrix_partition.get_major_value_start_offset(); - - // FIXME: This is highly inefficeint for graphs with high-degree vertices. If we renumber - // vertices to insure that rows within a partition are sorted by their out-degree in decreasing - // order, we will apply this kernel only to low out-degree vertices. - if (frontier_size > 0) { - raft::grid_1d_thread_t for_all_low_degree_grid( - frontier_size, - detail::update_frontier_v_push_if_out_nbr_for_all_block_size, - handle.get_device_properties().maxGridSize[0]); - - if (frontier_rows.size() > 0) { - detail::for_all_frontier_row_for_all_nbr_low_degree<<>>( - matrix_partition, - frontier_rows.begin(), - frontier_rows.end(), - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first, - buffer_key_first, - buffer_payload_first, - vertex_frontier.get_buffer_idx_ptr(), - e_op); - } else { - detail::for_all_frontier_row_for_all_nbr_low_degree<<>>( - matrix_partition, - vertex_first, - vertex_last, - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first, - buffer_key_first, - buffer_payload_first, - vertex_frontier.get_buffer_idx_ptr(), - e_op); - } - } - } - - // 2. 
reduce the buffer - - auto num_buffer_offset = edge_t{0}; - - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; - auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; - - auto num_buffer_elements = detail::reduce_buffer_elements(handle, - buffer_key_first, - buffer_payload_first, - vertex_frontier.get_buffer_idx_value(), - reduce_op); - - if (GraphViewType::is_multi_gpu) { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - std::vector h_vertex_lasts(graph_view.is_hypergraph_partitioned() ? row_comm_size - : col_comm_size); - for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { - h_vertex_lasts[i] = graph_view.get_vertex_partition_last( - graph_view.is_hypergraph_partitioned() ? col_comm_rank * row_comm_size + i - : row_comm_rank * col_comm_size + i); - } - - rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), handle.get_stream()); - raft::update_device( - d_vertex_lasts.data(), h_vertex_lasts.data(), h_vertex_lasts.size(), handle.get_stream()); - rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), - handle.get_stream()); - thrust::lower_bound(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - buffer_key_first, - buffer_key_first + num_buffer_elements, - d_vertex_lasts.begin(), - d_vertex_lasts.end(), - d_tx_buffer_last_boundaries.begin()); - std::vector h_tx_buffer_last_boundaries(d_tx_buffer_last_boundaries.size()); - raft::update_host(h_tx_buffer_last_boundaries.data(), - d_tx_buffer_last_boundaries.data(), - d_tx_buffer_last_boundaries.size(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - std::vector tx_counts(h_tx_buffer_last_boundaries.size()); - std::adjacent_difference( - h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); - - std::vector rx_counts(graph_view.is_hypergraph_partitioned() ? row_comm_size - : col_comm_size); - std::vector count_requests(tx_counts.size() + rx_counts.size()); - size_t tx_self_i = std::numeric_limits::max(); - for (size_t i = 0; i < tx_counts.size(); ++i) { - auto comm_dst_rank = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : row_comm_rank * col_comm_size + static_cast(i); - if (comm_dst_rank == comm_rank) { - tx_self_i = i; - // FIXME: better define request_null (similar to MPI_REQUEST_NULL) under raft::comms - count_requests[i] = std::numeric_limits::max(); - } else { - comm.isend(&tx_counts[i], 1, comm_dst_rank, 0 /* tag */, count_requests.data() + i); - } - } - for (size_t i = 0; i < rx_counts.size(); ++i) { - auto comm_src_rank = graph_view.is_hypergraph_partitioned() - ? 
col_comm_rank * row_comm_size + static_cast(i) - : static_cast(i) * row_comm_size + comm_rank / col_comm_size; - if (comm_src_rank == comm_rank) { - assert(tx_self_i != std::numeric_limits::max()); - rx_counts[i] = tx_counts[tx_self_i]; - // FIXME: better define request_null (similar to MPI_REQUEST_NULL) under raft::comms - count_requests[tx_counts.size() + i] = std::numeric_limits::max(); - } else { - comm.irecv(&rx_counts[i], - 1, - comm_src_rank, - 0 /* tag */, - count_requests.data() + tx_counts.size() + i); - } - } - // FIXME: better define request_null (similar to MPI_REQUEST_NULL) under raft::comms, if - // raft::comms::wait immediately returns on seeing request_null, this remove is unnecessary - count_requests.erase(std::remove(count_requests.begin(), - count_requests.end(), - std::numeric_limits::max()), - count_requests.end()); - comm.waitall(count_requests.size(), count_requests.data()); - - std::vector tx_offsets(tx_counts.size() + 1, edge_t{0}); - std::partial_sum(tx_counts.begin(), tx_counts.end(), tx_offsets.begin() + 1); - std::vector rx_offsets(rx_counts.size() + 1, edge_t{0}); - std::partial_sum(rx_counts.begin(), rx_counts.end(), rx_offsets.begin() + 1); - - // FIXME: this will require costly reallocation if we don't use the new CUDA feature to reserve - // address space. - // FIXME: std::max(actual size, 1) as ncclRecv currently hangs if recvuff is nullptr even if - // count is 0 - vertex_frontier.resize_buffer(std::max(num_buffer_elements + rx_offsets.back(), size_t(1))); - - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; - auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; - - std::vector tx_dst_ranks(tx_counts.size()); - std::vector rx_src_ranks(rx_counts.size()); - for (size_t i = 0; i < tx_dst_ranks.size(); ++i) { - tx_dst_ranks[i] = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : row_comm_rank * col_comm_size + static_cast(i); - } - for (size_t i = 0; i < rx_src_ranks.size(); ++i) { - rx_src_ranks[i] = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : static_cast(i) * row_comm_size + comm_rank / col_comm_size; - } - - device_multicast_sendrecv( - comm, - buffer_key_first, - tx_counts, - tx_offsets, - tx_dst_ranks, - buffer_key_first + num_buffer_elements, - rx_counts, - rx_offsets, - rx_src_ranks, - handle.get_stream()); - device_multicast_sendrecv( - comm, - buffer_payload_first, - tx_counts, - tx_offsets, - tx_dst_ranks, - buffer_payload_first + num_buffer_elements, - rx_counts, - rx_offsets, - rx_src_ranks, - handle.get_stream()); - - // FIXME: this does not exploit the fact that each segment is sorted. Lost performance - // optimization opportunities. - // FIXME: we can use [vertex_frontier.buffer_begin(), vertex_frontier.buffer_begin() + - // num_buffer_elements) as temporary buffer inside reduce_buffer_elements(). - num_buffer_offset = num_buffer_elements; - num_buffer_elements = detail::reduce_buffer_elements(handle, - buffer_key_first + num_buffer_elements, - buffer_payload_first + num_buffer_elements, - rx_offsets.back(), - reduce_op); - } - - // 3. 
update vertex properties - - if (num_buffer_elements > 0) { - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; - auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; - - raft::grid_1d_thread_t update_grid(num_buffer_elements, - detail::update_frontier_v_push_if_out_nbr_update_block_size, - handle.get_device_properties().maxGridSize[0]); - - auto constexpr invalid_vertex = invalid_vertex_id::value; - - vertex_partition_device_t vertex_partition(graph_view); - - auto bucket_and_bucket_size_device_ptrs = - vertex_frontier.get_bucket_and_bucket_size_device_pointers(); - detail::update_frontier_and_vertex_output_values - <<>>( - vertex_partition, - buffer_key_first, - buffer_payload_first, - num_buffer_elements, - vertex_value_input_first, - vertex_value_output_first, - std::get<0>(bucket_and_bucket_size_device_ptrs).get(), - std::get<1>(bucket_and_bucket_size_device_ptrs).get(), - VertexFrontierType::kInvalidBucketIdx, - invalid_vertex, - v_op); - - auto bucket_sizes_device_ptr = std::get<1>(bucket_and_bucket_size_device_ptrs); - thrust::host_vector bucket_sizes( - bucket_sizes_device_ptr, bucket_sizes_device_ptr + VertexFrontierType::kNumBuckets); - for (size_t i = 0; i < VertexFrontierType::kNumBuckets; ++i) { - vertex_frontier.get_bucket(i).set_size(bucket_sizes[i]); - } - } -} - -/* - -FIXME: - -iterating over lower triangular (or upper triangular) : triangle counting -LRB might be necessary if the cost of processing an edge (i, j) is a function of degree(i) and -degree(j) : triangle counting -push-pull switching support (e.g. DOBFS), in this case, we need both -CSR & CSC (trade-off execution time vs memory requirement, unless graph is symmetric) -if graph is symmetric, there will be additional optimization opportunities (e.g. in-degree == -out-degree) For BFS, sending a bit vector (for the entire set of dest vertices per partitoin may -work better we can use thrust::set_intersection for triangle counting think about adding thrust -wrappers for reduction functions. Can I pass nullptr for dummy -instead of thrust::make_counting_iterator(0)? -*/ - -} // namespace experimental -} // namespace cugraph diff --git a/cpp/include/patterns/vertex_frontier.cuh b/cpp/include/patterns/vertex_frontier.cuh deleted file mode 100644 index ccb9e1a5a0d..00000000000 --- a/cpp/include/patterns/vertex_frontier.cuh +++ /dev/null @@ -1,384 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace cugraph { -namespace experimental { - -namespace detail { - -// FIXME: block size requires tuning -int32_t constexpr move_and_invalidate_if_block_size = 128; - -// FIXME: better move to another file for reusability -inline size_t round_up(size_t number_to_round, size_t modulus) -{ - return ((number_to_round + (modulus - 1)) / modulus) * modulus; -} - -template -auto make_buffer_zip_iterator_impl(std::vector& buffer_ptrs, - size_t offset, - std::index_sequence) -{ - auto key_ptr = reinterpret_cast(buffer_ptrs[0]) + offset; - auto payload_it = thrust::make_zip_iterator( - thrust::make_tuple(reinterpret_cast::type*>( - buffer_ptrs[1 + Is])...)); - return std::make_tuple(key_ptr, payload_it); -} - -template -auto make_buffer_zip_iterator(std::vector& buffer_ptrs, size_t offset) -{ - size_t constexpr tuple_size = thrust::tuple_size::value; - return make_buffer_zip_iterator_impl( - buffer_ptrs, offset, std::make_index_sequence()); -} - -template -__global__ void move_and_invalidate_if(RowIterator row_first, - RowIterator row_last, - vertex_t** bucket_ptrs, - size_t* bucket_sizes_ptr, - size_t this_bucket_idx, - size_t invalid_bucket_idx, - vertex_t invalid_vertex, - SplitOp split_op) -{ - static_assert( - std::is_same::value_type, vertex_t>::value); - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - size_t idx = tid; - size_t block_idx = blockIdx.x; - auto num_elements = thrust::distance(row_first, row_last); - // FIXME: it might be more performant to process more than one element per thread - auto num_blocks = (num_elements + blockDim.x - 1) / blockDim.x; - - using BlockScan = cub::BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; - - __shared__ size_t bucket_block_start_offsets[num_buckets]; - - size_t bucket_block_local_offsets[num_buckets]; - size_t bucket_block_aggregate_sizes[num_buckets]; - - while (block_idx < num_blocks) { - for (size_t i = 0; i < num_buckets; ++i) { bucket_block_local_offsets[i] = 0; } - - size_t selected_bucket_idx{invalid_bucket_idx}; - vertex_t key{invalid_vertex}; - - if (idx < num_elements) { - key = *(row_first + idx); - selected_bucket_idx = split_op(key); - if (selected_bucket_idx != this_bucket_idx) { - *(row_first + idx) = invalid_vertex; - if (selected_bucket_idx != invalid_bucket_idx) { - bucket_block_local_offsets[selected_bucket_idx] = 1; - } - } - } - - for (size_t i = 0; i < num_buckets; ++i) { - BlockScan(temp_storage) - .ExclusiveSum(bucket_block_local_offsets[i], - bucket_block_local_offsets[i], - bucket_block_aggregate_sizes[i]); - } - - if (threadIdx.x == 0) { - for (size_t i = 0; i < num_buckets; ++i) { - static_assert(sizeof(unsigned long long int) == sizeof(size_t)); - bucket_block_start_offsets[i] = - atomicAdd(reinterpret_cast(bucket_sizes_ptr + i), - static_cast(bucket_block_aggregate_sizes[i])); - } - } - - __syncthreads(); - - // FIXME: better use shared memory buffer to aggreaget global memory writes - if ((selected_bucket_idx != this_bucket_idx) && (selected_bucket_idx != invalid_bucket_idx)) { - bucket_ptrs[selected_bucket_idx][bucket_block_start_offsets[selected_bucket_idx] + - bucket_block_local_offsets[selected_bucket_idx]] = key; - } - - idx += gridDim.x * blockDim.x; - block_idx += gridDim.x; - } -} - -} // namespace detail - -template -class Bucket { - public: - Bucket(raft::handle_t const& handle, size_t capacity) - 
: handle_ptr_(&handle), elements_(capacity, invalid_vertex_id::value) - { - } - - void insert(vertex_t v) - { - elements_[size_] = v; - ++size_; - } - - size_t size() const { return size_; } - - void set_size(size_t size) { size_ = size; } - - template - std::enable_if_t aggregate_size() const - { - return host_scalar_allreduce(handle_ptr_->get_comms(), size_, handle_ptr_->get_stream()); - } - - template - std::enable_if_t aggregate_size() const - { - return size_; - } - - void clear() { size_ = 0; } - - size_t capacity() const { return elements_.size(); } - - auto const data() const { return elements_.data().get(); } - - auto data() { return elements_.data().get(); } - - auto const begin() const { return elements_.begin(); } - - auto begin() { return elements_.begin(); } - - auto const end() const { return elements_.begin() + size_; } - - auto end() { return elements_.begin() + size_; } - - private: - raft::handle_t const* handle_ptr_{nullptr}; - rmm::device_vector elements_{}; - size_t size_{0}; -}; - -template -class VertexFrontier { - public: - static size_t constexpr kNumBuckets = num_buckets; - static size_t constexpr kInvalidBucketIdx{std::numeric_limits::max()}; - - VertexFrontier(raft::handle_t const& handle, std::vector bucket_capacities) - : handle_ptr_(&handle), - tmp_bucket_ptrs_(num_buckets, nullptr), - tmp_bucket_sizes_(num_buckets, 0), - buffer_ptrs_(kReduceInputTupleSize + 1 /* to store destination column number */, nullptr), - buffer_idx_(0, handle_ptr_->get_stream()) - { - CUGRAPH_EXPECTS(bucket_capacities.size() == num_buckets, - "invalid input argument bucket_capacities (size mismatch)"); - for (size_t i = 0; i < num_buckets; ++i) { - buckets_.emplace_back(handle, bucket_capacities[i]); - } - buffer_.set_stream(handle_ptr_->get_stream()); - } - - Bucket& get_bucket(size_t bucket_idx) { return buckets_[bucket_idx]; } - - Bucket const& get_bucket(size_t bucket_idx) const - { - return buckets_[bucket_idx]; - } - - void swap_buckets(size_t bucket_idx0, size_t bucket_idx1) - { - std::swap(buckets_[bucket_idx0], buckets_[bucket_idx1]); - } - - template - void split_bucket(size_t bucket_idx, SplitOp split_op) - { - auto constexpr invalid_vertex = invalid_vertex_id::value; - - auto bucket_and_bucket_size_device_ptrs = get_bucket_and_bucket_size_device_pointers(); - - auto& this_bucket = get_bucket(bucket_idx); - if (this_bucket.size() > 0) { - raft::grid_1d_thread_t move_and_invalidate_if_grid( - this_bucket.size(), - detail::move_and_invalidate_if_block_size, - handle_ptr_->get_device_properties().maxGridSize[0]); - - detail::move_and_invalidate_if - <<get_stream()>>>(this_bucket.begin(), - this_bucket.end(), - std::get<0>(bucket_and_bucket_size_device_ptrs).get(), - std::get<1>(bucket_and_bucket_size_device_ptrs).get(), - bucket_idx, - kInvalidBucketIdx, - invalid_vertex, - split_op); - } - - // FIXME: if we adopt CUDA cooperative group https://devblogs.nvidia.com/cooperative-groups - // and global sync(), we can merge this step with the above kernel (and rename the above kernel - // to move_if) - auto it = - thrust::remove_if(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), - get_bucket(bucket_idx).begin(), - get_bucket(bucket_idx).end(), - [] __device__(auto value) { return value == invalid_vertex; }); - - auto bucket_sizes_device_ptr = std::get<1>(bucket_and_bucket_size_device_ptrs); - thrust::host_vector bucket_sizes(bucket_sizes_device_ptr, - bucket_sizes_device_ptr + kNumBuckets); - for (size_t i = 0; i < kNumBuckets; ++i) { - if (i != 
bucket_idx) { get_bucket(i).set_size(bucket_sizes[i]); } - } - - auto size = thrust::distance(get_bucket(bucket_idx).begin(), it); - get_bucket(bucket_idx).set_size(size); - - return; - } - - auto get_bucket_and_bucket_size_device_pointers() - { - thrust::host_vector tmp_ptrs(buckets_.size(), nullptr); - thrust::host_vector tmp_sizes(buckets_.size(), 0); - for (size_t i = 0; i < buckets_.size(); ++i) { - tmp_ptrs[i] = get_bucket(i).data(); - tmp_sizes[i] = get_bucket(i).size(); - } - tmp_bucket_ptrs_ = tmp_ptrs; - tmp_bucket_sizes_ = tmp_sizes; - return std::make_tuple(tmp_bucket_ptrs_.data(), tmp_bucket_sizes_.data()); - } - - void resize_buffer(size_t size) - { - // FIXME: rmm::device_buffer resize incurs copy if memory is reallocated, which is unnecessary - // in this case. - buffer_.resize(compute_aggregate_buffer_size_in_bytes(size), handle_ptr_->get_stream()); - if (size > buffer_capacity_) { - buffer_capacity_ = size; - update_buffer_ptrs(); - } - buffer_size_ = size; - } - - void clear_buffer() { resize_buffer(0); } - - void shrink_to_fit_buffer() - { - if (buffer_size_ != buffer_capacity_) { - // FIXME: rmm::device_buffer shrink_to_fit incurs copy if memory is reallocated, which is - // unnecessary in this case. - buffer_.shrink_to_fit(handle_ptr_->get_stream()); - update_buffer_ptrs(); - buffer_capacity_ = buffer_size_; - } - } - - auto buffer_begin() - { - return detail::make_buffer_zip_iterator(buffer_ptrs_, 0); - } - - auto buffer_end() - { - return detail::make_buffer_zip_iterator(buffer_ptrs_, - buffer_size_); - } - - auto get_buffer_idx_ptr() { return buffer_idx_.data(); } - - size_t get_buffer_idx_value() { return buffer_idx_.value(handle_ptr_->get_stream()); } - - void set_buffer_idx_value(size_t value) - { - buffer_idx_.set_value(value, handle_ptr_->get_stream()); - } - - private: - static size_t constexpr kReduceInputTupleSize = thrust::tuple_size::value; - static size_t constexpr kBufferAlignment = 128; - - raft::handle_t const* handle_ptr_{nullptr}; - std::vector> buckets_{}; - rmm::device_vector tmp_bucket_ptrs_{}; - rmm::device_vector tmp_bucket_sizes_{}; - - std::array tuple_element_sizes_ = - compute_thrust_tuple_element_sizes()(); - std::vector buffer_ptrs_{}; - rmm::device_buffer buffer_{}; - size_t buffer_size_{0}; - size_t buffer_capacity_{0}; - rmm::device_scalar buffer_idx_{}; - - // FIXME: better pick between this apporach or the approach used in allocate_comm_buffer - size_t compute_aggregate_buffer_size_in_bytes(size_t size) - { - size_t aggregate_buffer_size_in_bytes = - detail::round_up(sizeof(vertex_t) * size, kBufferAlignment); - for (size_t i = 0; i < kReduceInputTupleSize; ++i) { - aggregate_buffer_size_in_bytes += - detail::round_up(tuple_element_sizes_[i] * size, kBufferAlignment); - } - return aggregate_buffer_size_in_bytes; - } - - void update_buffer_ptrs() - { - uintptr_t ptr = reinterpret_cast(buffer_.data()); - buffer_ptrs_[0] = reinterpret_cast(ptr); - ptr += detail::round_up(sizeof(vertex_t) * buffer_capacity_, kBufferAlignment); - for (size_t i = 0; i < kReduceInputTupleSize; ++i) { - buffer_ptrs_[1 + i] = reinterpret_cast(ptr); - ptr += detail::round_up(tuple_element_sizes_[i] * buffer_capacity_, kBufferAlignment); - } - } -}; - -} // namespace experimental -} // namespace cugraph diff --git a/cpp/include/utilities/cython.hpp b/cpp/include/utilities/cython.hpp deleted file mode 100644 index cd621a516ea..00000000000 --- a/cpp/include/utilities/cython.hpp +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include - -namespace cugraph { -namespace cython { - -enum class numberTypeEnum : int { int32Type, int64Type, floatType, doubleType }; - -enum class graphTypeEnum : int { - // represents unintiialized or NULL ptr - null, - // represents some legacy Cxx type. This and other LegacyCxx values are not - // used for the unique_ptr in a graph_container_t, but instead for when this - // enum is used for determining high-level code paths to take to prevent - // needing to expose each legacy enum value to cython. - LegacyCSR, - LegacyCSC, - LegacyCOO, - // represents that a GraphCxxView* unique_ptr type is present in a - // graph_container_t. - GraphCSRViewFloat, - GraphCSRViewDouble, - GraphCSCViewFloat, - GraphCSCViewDouble, - GraphCOOViewFloat, - GraphCOOViewDouble, - // represents values present in the graph_container_t to construct a graph_t, - // but unlike legacy classes does not mean a graph_t unique_ptr is present in - // the container. - graph_t, -}; - -// "container" for a graph type instance which insulates the owner from the -// specifics of the actual graph type. This is intended to be used in Cython -// code that only needs to pass a graph object to another wrapped C++ API. This -// greatly simplifies the Cython code since the Cython definition only needs to -// define the container and not the various individual graph types in Cython. -struct graph_container_t { - // FIXME: This union is in place only to support legacy calls, remove when - // migration to graph_t types is complete, or when legacy graph objects are - // constructed in the call_< wrappers instead of the - // populate_graph_container_legacy() function. - union graphPtrUnion { - ~graphPtrUnion() {} - - void* null; - std::unique_ptr> GraphCSRViewFloatPtr; - std::unique_ptr> GraphCSRViewDoublePtr; - std::unique_ptr> GraphCSCViewFloatPtr; - std::unique_ptr> GraphCSCViewDoublePtr; - std::unique_ptr> GraphCOOViewFloatPtr; - std::unique_ptr> GraphCOOViewDoublePtr; - }; - - graph_container_t() : graph_ptr_union{nullptr}, graph_type{graphTypeEnum::null} {} - ~graph_container_t() {} - - // The expected usage of a graph_container_t is for it to be created as part - // of a cython wrapper simply for passing a templated instantiation of a - // particular graph class from one call to another, and not to exist outside - // of the individual wrapper function (deleted when the instance goes out of - // scope once the wrapper function returns). Therefore, copys and assignments - // to an instance are not supported and these methods are deleted. - graph_container_t(const graph_container_t&) = delete; - graph_container_t& operator=(const graph_container_t&) = delete; - - graphPtrUnion graph_ptr_union; - graphTypeEnum graph_type; - - // primitive data used for constructing graph_t instances. 
- void* src_vertices; - void* dst_vertices; - void* weights; - void* vertex_partition_offsets; - - size_t num_partition_edges; - size_t num_global_vertices; - size_t num_global_edges; - numberTypeEnum vertexType; - numberTypeEnum edgeType; - numberTypeEnum weightType; - bool transposed; - bool is_multi_gpu; - bool sorted_by_degree; - bool do_expensive_check; - bool hypergraph_partitioned; - int row_comm_size; - int col_comm_size; - int row_comm_rank; - int col_comm_rank; - experimental::graph_properties_t graph_props; -}; - -// FIXME: finish description for vertex_partition_offsets -// -// Factory function for populating an empty graph container with a new graph -// object from basic types, and sets the corresponding meta-data. Args are: -// -// graph_container_t& graph_container -// Reference to the graph_container_t instance to -// populate. populate_graph_container() can only be called on an "empty" -// container (ie. a container that has not been previously populated by -// populate_graph_container()) -// -// graphTypeEnum legacyType -// Specifies the type of graph when instantiating a legacy graph type -// (GraphCSRViewFloat, etc.). -// NOTE: this parameter will be removed when the transition to exclusinve use -// of the new 2D graph classes is complete. -// -// raft::handle_t const& handle -// Raft handle to be set on the new graph instance in the container -// -// void* src_vertices, dst_vertices, weights -// Pointer to an array of values representing source and destination vertices, -// and edge weights respectively. The value types of the array are specified -// using numberTypeEnum values separately (see below). offsets should be size -// num_vertices+1, indices should be size num_edges, weights should also be -// size num_edges -// -// void* vertex_partition_offsets -// Pointer to an array of vertexType values representing offsets into the -// individual partitions for a multi-GPU paritioned graph. The offsets are used for ... -// -// numberTypeEnum vertexType, edgeType, weightType -// numberTypeEnum enum value describing the data type for the vertices, -// offsets, and weights arrays respectively. These enum values are used to -// instantiate the proper templated graph type and for casting the arrays -// accordingly. -// -// int num_vertices, num_edges -// The number of vertices and edges respectively in the graph represented by -// the above arrays. -// -// bool transposed -// true if the resulting graph object should store a transposed adjacency -// matrix -// -// bool multi_gpu -// true if the resulting graph object is to be used for a multi-gpu -// application -void populate_graph_container(graph_container_t& graph_container, - raft::handle_t& handle, - void* src_vertices, - void* dst_vertices, - void* weights, - void* vertex_partition_offsets, - numberTypeEnum vertexType, - numberTypeEnum edgeType, - numberTypeEnum weightType, - size_t num_partition_edges, - size_t num_global_vertices, - size_t num_global_edges, - bool sorted_by_degree, - bool transposed, - bool multi_gpu); - -// FIXME: comment this function -// FIXME: Should local_* values be void* as well? 
-void populate_graph_container_legacy(graph_container_t& graph_container, - graphTypeEnum legacyType, - raft::handle_t const& handle, - void* offsets, - void* indices, - void* weights, - numberTypeEnum offsetType, - numberTypeEnum indexType, - numberTypeEnum weightType, - size_t num_global_vertices, - size_t num_global_edges, - int* local_vertices, - int* local_edges, - int* local_offsets); - -// Wrapper for calling Louvain using a graph container -template -std::pair call_louvain(raft::handle_t const& handle, - graph_container_t const& graph_container, - void* identifiers, - void* parts, - size_t max_level, - weight_t resolution); - -// Wrapper for calling Pagerank using a graph container -template -void call_pagerank(raft::handle_t const& handle, - graph_container_t const& graph_container, - vertex_t* identifiers, - weight_t* pagerank, - vertex_t personalization_subset_size, - vertex_t* personalization_subset, - weight_t* personalization_values, - double alpha, - double tolerance, - int64_t max_iter, - bool has_guess); - -// Wrapper for calling Katz centrality using a graph container -template -void call_katz_centrality(raft::handle_t const& handle, - graph_container_t const& graph_container, - vertex_t* identifiers, - weight_t* katz_centrality, - double alpha, - double beta, - double tolerance, - int64_t max_iter, - bool normalized, - bool has_guess); - -// Wrapper for calling BFS through a graph container -template -void call_bfs(raft::handle_t const& handle, - graph_container_t const& graph_container, - vertex_t* identifiers, - vertex_t* distances, - vertex_t* predecessors, - double* sp_counters, - const vertex_t start_vertex, - bool directed); - -// Wrapper for calling SSSP through a graph container -template -void call_sssp(raft::handle_t const& handle, - graph_container_t const& graph_container, - vertex_t* identifiers, - weight_t* distances, - vertex_t* predecessors, - const vertex_t source_vertex); - -// Helper for setting up subcommunicators, typically called as part of the -// user-initiated comms initialization in Python. -// -// raft::handle_t& handle -// Raft handle for which the new subcommunicators will be created. The -// subcommunicators will then be accessible from the handle passed to the -// parallel processes. -// -// size_t row_comm_size -// Number of items in a partition row (ie. pcols), needed for creating the -// appropriate number of subcommunicator instances. -void init_subcomms(raft::handle_t& handle, size_t row_comm_size); - -} // namespace cython -} // namespace cugraph diff --git a/cpp/scripts/run-clang-format.py b/cpp/scripts/run-clang-format.py index 9bd3c364329..02434278343 100644 --- a/cpp/scripts/run-clang-format.py +++ b/cpp/scripts/run-clang-format.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -22,7 +22,7 @@ import tempfile -EXPECTED_VERSION = "8.0.1" +EXPECTED_VERSION = "11.0.0" VERSION_REGEX = re.compile(r"clang-format version ([0-9.]+)") # NOTE: populate this list with more top-level dirs as we add more of them to the cugraph repo DEFAULT_DIRS = ["cpp/include", @@ -139,4 +139,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/cpp/src/centrality/README.md b/cpp/src/centrality/README.md new file mode 100644 index 00000000000..31b5ed6720e --- /dev/null +++ b/cpp/src/centrality/README.md @@ -0,0 +1,81 @@ +# Centrality algorithms +cuGraph Pagerank is implemented using our graph primitive library + +## Pagerank + +The unit test code is the best place to search for examples on calling pagerank. + + * [SG Implementation](../../tests/experimental/pagerank_test.cpp) + * [MG Implementation](../../tests/pagerank/mg_pagerank_test.cpp) + +## Simple pagerank + +The example assumes that you create an SG or MG graph somehow. The caller must create the pageranks vector in device memory and pass in the raw pointer to that vector into the pagerank function. + +```cpp +#include +... +using vertex_t = int32_t; // or int64_t, whichever is appropriate +using weight_t = float; // or double, whichever is appropriate +using result_t = weight_t; // could specify float or double also +raft::handle_t handle; // Must be configured if MG +auto graph_view = graph.view(); // assumes you have created a graph somehow + +result_t constexpr alpha{0.85}; +result_t constexpr epsilon{1e-6}; + +rmm::device_uvector pageranks_v(graph_view.get_number_of_vertices(), handle.get_stream()); + +// pagerank optionally supports three additional parameters: +// max_iterations - maximum number of iterations, if pagerank doesn't coverge by +// then we abort +// has_initial_guess - if true, values in the pagerank array when the call is initiated +// will be used as the initial pagerank values. These values will +// be normalized before use. If false (the default), the values +// in the pagerank array will be set to 1/num_vertices before +// starting the computation. +// do_expensive_check - perform extensive validation of the input data before +// executing algorithm. Off by default. Note: turning this on +// is expensive +cugraph::experimental::pagerank(handle, graph_view, nullptr, nullptr, nullptr, vertex_t{0}, + pageranks_v.data(), alpha, epsilon); +``` + +## Personalized Pagerank + +The example assumes that you create an SG or MG graph somehow. The caller must create the pageranks vector in device memory and pass in the raw pointer to that vector into the pagerank function. Additionally, the caller must create personalization_vertices and personalized_values vectors in device memory, populate them and pass in the raw pointers to those vectors. + +```cpp +#include +... 
+using vertex_t = int32_t; // or int64_t, whichever is appropriate +using weight_t = float; // or double, whichever is appropriate +using result_t = weight_t; // could specify float or double also +raft::handle_t handle; // Must be configured if MG +auto graph_view = graph.view(); // assumes you have created a graph somehow +vertex_t number_of_personalization_vertices; // Provided by caller + +result_t constexpr alpha{0.85}; +result_t constexpr epsilon{1e-6}; + +rmm::device_uvector pageranks_v(graph_view.get_number_of_vertices(), handle.get_stream()); +rmm::device_uvector personalization_vertices(number_of_personalization_vertices, handle.get_stream()); +rmm::device_uvector personalization_values(number_of_personalization_vertices, handle.get_stream()); + +// Populate personalization_vertices, personalization_values with user provided data + +// pagerank optionally supports three additional parameters: +// max_iterations - maximum number of iterations, if pagerank doesn't coverge by +// then we abort +// has_initial_guess - if true, values in the pagerank array when the call is initiated +// will be used as the initial pagerank values. These values will +// be normalized before use. If false (the default), the values +// in the pagerank array will be set to 1/num_vertices before +// starting the computation. +// do_expensive_check - perform extensive validation of the input data before +// executing algorithm. Off by default. Note: turning this on +// is expensive +cugraph::experimental::pagerank(handle, graph_view, nullptr, personalization_vertices.data(), + personalization_values.data(), number_of_personalization_vertices, + pageranks_v.data(), alpha, epsilon); +``` diff --git a/cpp/src/centrality/betweenness_centrality.cu b/cpp/src/centrality/betweenness_centrality.cu index 8ff62f7ddb6..6949399b4b7 100644 --- a/cpp/src/centrality/betweenness_centrality.cu +++ b/cpp/src/centrality/betweenness_centrality.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,10 +20,12 @@ #include -#include -#include +#include +#include +#include + #include -#include +#include #include #include "betweenness_centrality.cuh" @@ -33,14 +35,14 @@ namespace cugraph { namespace detail { namespace { template -void betweenness_centrality_impl(raft::handle_t const &handle, - GraphCSRView const &graph, - result_t *result, +void betweenness_centrality_impl(raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + result_t* result, bool normalize, bool endpoints, - weight_t const *weight, + weight_t const* weight, vertex_t number_of_sources, - vertex_t const *sources, + vertex_t const* sources, vertex_t total_number_of_sources) { // Current Implementation relies on BFS @@ -57,13 +59,13 @@ void betweenness_centrality_impl(raft::handle_t const &handle, } template -void edge_betweenness_centrality_impl(raft::handle_t const &handle, - GraphCSRView const &graph, - result_t *result, +void edge_betweenness_centrality_impl(raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + result_t* result, bool normalize, - weight_t const *weight, + weight_t const* weight, vertex_t number_of_sources, - vertex_t const *sources, + vertex_t const* sources, vertex_t total_number_of_sources) { // Current Implementation relies on BFS @@ -82,7 +84,7 @@ void edge_betweenness_centrality_impl(raft::handle_t const &handle, // bc.rescale_by_total_sources_used(total_number_of_sources); } template -vertex_t get_total_number_of_sources(raft::handle_t const &handle, vertex_t local_number_of_sources) +vertex_t get_total_number_of_sources(raft::handle_t const& handle, vertex_t local_number_of_sources) { vertex_t total_number_of_sources_used = local_number_of_sources; if (handle.comms_initialized()) { @@ -101,13 +103,13 @@ vertex_t get_total_number_of_sources(raft::handle_t const &handle, vertex_t loca } // namespace template -void verify_betweenness_centrality_input(result_t *result, +void verify_betweenness_centrality_input(result_t* result, bool is_edge_betweenness, bool normalize, bool endpoints, - weight_t const *weights, + weight_t const* weights, vertex_t const number_of_sources, - vertex_t const *sources) + vertex_t const* sources) { static_assert(std::is_same::value, "vertex_t should be int"); static_assert(std::is_same::value, "edge_t should be int"); @@ -116,7 +118,7 @@ void verify_betweenness_centrality_input(result_t *result, static_assert(std::is_same::value || std::is_same::value, "result_t should be float or double"); - CUGRAPH_EXPECTS(result != nullptr, "Invalid API parameter: betwenness pointer is NULL"); + CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: betwenness pointer is NULL"); CUGRAPH_EXPECTS(number_of_sources >= 0, "Number of sources must be positive or equal to 0."); if (number_of_sources != 0) { CUGRAPH_EXPECTS(sources != nullptr, @@ -137,12 +139,12 @@ void BC::setup() } template -void BC::configure(result_t *betweenness, +void BC::configure(result_t* betweenness, bool is_edge_betweenness, bool normalized, bool endpoints, - weight_t const *weights, - vertex_t const *sources, + weight_t const* weights, + vertex_t const* sources, vertex_t number_of_sources) { // --- Bind betweenness output vector to internal --- @@ -227,15 +229,13 @@ void BC::compute_single_source(vertex_t so // the traversal, this value is avalaible within the bfs implementation and // there could be a way to access it directly and avoid both replace and the // max - thrust::replace(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + 
thrust::replace(rmm::exec_policy(handle_.get_stream_view()), distances_, distances_ + number_of_vertices_, std::numeric_limits::max(), static_cast(-1)); - auto current_max_depth = - thrust::max_element(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), - distances_, - distances_ + number_of_vertices_); + auto current_max_depth = thrust::max_element( + rmm::exec_policy(handle_.get_stream_view()), distances_, distances_ + number_of_vertices_); vertex_t max_depth = 0; CUDA_TRY(cudaMemcpy(&max_depth, current_max_depth, sizeof(vertex_t), cudaMemcpyDeviceToHost)); // Step 2) Dependency accumulation @@ -265,7 +265,7 @@ void BC::accumulate(vertex_t source_vertex template void BC::initialize_dependencies() { - thrust::fill(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::fill(rmm::exec_policy(handle_.get_stream_view()), deltas_, deltas_ + number_of_vertices_, static_cast(0)); @@ -315,16 +315,13 @@ template ::add_reached_endpoints_to_source_betweenness( vertex_t source_vertex) { - vertex_t number_of_unvisited_vertices = - thrust::count(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), - distances_, - distances_ + number_of_vertices_, - -1); + vertex_t number_of_unvisited_vertices = thrust::count( + rmm::exec_policy(handle_.get_stream_view()), distances_, distances_ + number_of_vertices_, -1); vertex_t number_of_visited_vertices_except_source = number_of_vertices_ - number_of_unvisited_vertices - 1; rmm::device_vector buffer(1); buffer[0] = number_of_visited_vertices_except_source; - thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::transform(rmm::exec_policy(handle_.get_stream_view()), buffer.begin(), buffer.end(), betweenness_ + source_vertex, @@ -335,7 +332,7 @@ void BC::add_reached_endpoints_to_source_b template void BC::add_vertices_dependencies_to_betweenness() { - thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::transform(rmm::exec_policy(handle_.get_stream_view()), deltas_, deltas_ + number_of_vertices_, betweenness_, @@ -420,7 +417,7 @@ void BC::apply_rescale_factor_to_betweenne { size_t result_size = number_of_vertices_; if (is_edge_betweenness_) result_size = number_of_edges_; - thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::transform(rmm::exec_policy(handle_.get_stream_view()), betweenness_, betweenness_ + result_size, thrust::make_constant_iterator(rescale_factor), @@ -451,14 +448,14 @@ void BC::rescale_by_total_sources_used( } // namespace detail template -void betweenness_centrality(raft::handle_t const &handle, - GraphCSRView const &graph, - result_t *result, +void betweenness_centrality(raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + result_t* result, bool normalize, bool endpoints, - weight_t const *weight, + weight_t const* weight, vertex_t k, - vertex_t const *vertices) + vertex_t const* vertices) { vertex_t total_number_of_sources_used = detail::get_total_number_of_sources(handle, k); if (handle.comms_initialized()) { @@ -491,32 +488,33 @@ void betweenness_centrality(raft::handle_t const &handle, } } -template void betweenness_centrality(const raft::handle_t &, - GraphCSRView const &, - float *, - bool, - bool, - float const *, - int, - int const *); +template void betweenness_centrality( + const raft::handle_t&, + legacy::GraphCSRView const&, + float*, + bool, + bool, + float const*, + int, + int const*); template void betweenness_centrality( - const raft::handle_t &, 
- GraphCSRView const &, - double *, + const raft::handle_t&, + legacy::GraphCSRView const&, + double*, bool, bool, - double const *, + double const*, int, - int const *); + int const*); template -void edge_betweenness_centrality(raft::handle_t const &handle, - GraphCSRView const &graph, - result_t *result, +void edge_betweenness_centrality(raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + result_t* result, bool normalize, - weight_t const *weight, + weight_t const* weight, vertex_t k, - vertex_t const *vertices) + vertex_t const* vertices) { vertex_t total_number_of_sources_used = detail::get_total_number_of_sources(handle, k); if (handle.comms_initialized()) { @@ -542,20 +540,20 @@ void edge_betweenness_centrality(raft::handle_t const &handle, } template void edge_betweenness_centrality( - const raft::handle_t &, - GraphCSRView const &, - float *, + const raft::handle_t&, + legacy::GraphCSRView const&, + float*, bool, - float const *, + float const*, int, - int const *); + int const*); template void edge_betweenness_centrality( - raft::handle_t const &handle, - GraphCSRView const &, - double *, + raft::handle_t const& handle, + legacy::GraphCSRView const&, + double*, bool, - double const *, + double const*, int, - int const *); + int const*); } // namespace cugraph diff --git a/cpp/src/centrality/betweenness_centrality.cuh b/cpp/src/centrality/betweenness_centrality.cuh index 418ac06faa4..706b8bfebac 100644 --- a/cpp/src/centrality/betweenness_centrality.cuh +++ b/cpp/src/centrality/betweenness_centrality.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,69 +22,69 @@ namespace cugraph { namespace detail { template -void betweenness_centrality(raft::handle_t const &handle, - GraphCSRView const &graph, - result_t *result, +void betweenness_centrality(raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + result_t* result, bool normalize, bool endpoints, - weight_t const *weight, + weight_t const* weight, vertex_t const number_of_sources, - vertex_t const *sources); + vertex_t const* sources); template -void edge_betweenness_centrality(GraphCSRView const &graph, - result_t *result, +void edge_betweenness_centrality(legacy::GraphCSRView const& graph, + result_t* result, bool normalize, - weight_t const *weight, + weight_t const* weight, vertex_t const number_of_sources, - vertex_t const *sources); + vertex_t const* sources); template -void verify_betweenness_centrality_input(result_t *result, +void verify_betweenness_centrality_input(result_t* result, bool is_edge_betweenness, bool normalize, bool endpoints, - weight_t const *weights, + weight_t const* weights, vertex_t const number_of_sources, - vertex_t const *sources); + vertex_t const* sources); template class BC { public: virtual ~BC(void) {} - BC(raft::handle_t const &handle, - GraphCSRView const &graph, + BC(raft::handle_t const& handle, + legacy::GraphCSRView const& graph, cudaStream_t stream = 0) : handle_(handle), graph_(graph) { setup(); } - void configure(result_t *betweenness, + void configure(result_t* betweenness, bool is_edge_betweenness, bool normalize, bool endpoints, - weight_t const *weight, - vertex_t const *sources, + weight_t const* weight, + vertex_t const* sources, vertex_t const number_of_sources); - void configure_edge(result_t *betweenness, + void configure_edge(result_t* betweenness, bool normalize, - weight_t const *weight, - vertex_t const *sources, + weight_t const* weight, + vertex_t const* sources, vertex_t const number_of_sources); void compute(); void rescale_by_total_sources_used(vertex_t total_number_of_sources_used); private: // --- RAFT handle --- - raft::handle_t const &handle_; + raft::handle_t const& handle_; // --- Information concerning the graph --- - const GraphCSRView &graph_; + const legacy::GraphCSRView& graph_; // --- These information are extracted on setup --- vertex_t number_of_vertices_; // Number of vertices in the graph vertex_t number_of_edges_; // Number of edges in the graph - edge_t const *offsets_ptr_; // Pointer to the offsets - vertex_t const *indices_ptr_; // Pointers to the indices + edge_t const* offsets_ptr_; // Pointer to the offsets + vertex_t const* indices_ptr_; // Pointers to the indices // --- Information from configuration --- bool configured_ = false; // Flag to ensure configuration was called @@ -92,14 +92,14 @@ class BC { bool is_edge_betweenness_ = false; // If True compute edge_betweeness // FIXME: For weighted version - weight_t const *edge_weights_ptr_ = nullptr; // Pointer to the weights + weight_t const* edge_weights_ptr_ = nullptr; // Pointer to the weights bool endpoints_ = false; // If True normalize the betweenness - vertex_t const *sources_ = nullptr; // Subset of vertices to gather information from + vertex_t const* sources_ = nullptr; // Subset of vertices to gather information from vertex_t number_of_sources_; // Number of vertices in sources // --- Output ---- // betweenness is set/read by users - using Vectors - result_t *betweenness_ = nullptr; + result_t* betweenness_ = nullptr; // --- Data required to perform computation ---- rmm::device_vector distances_vec_; @@ -107,13 
+107,13 @@ class BC { rmm::device_vector sp_counters_vec_; rmm::device_vector deltas_vec_; - vertex_t *distances_ = + vertex_t* distances_ = nullptr; // array(|V|) stores the distances gathered by the latest SSSP - vertex_t *predecessors_ = + vertex_t* predecessors_ = nullptr; // array(|V|) stores the predecessors of the latest SSSP - double *sp_counters_ = + double* sp_counters_ = nullptr; // array(|V|) stores the shortest path counter for the latest SSSP - double *deltas_ = nullptr; // array(|V|) stores the dependencies for the latest SSSP + double* deltas_ = nullptr; // array(|V|) stores the dependencies for the latest SSSP int max_grid_dim_1D_ = 0; int max_block_dim_1D_ = 0; diff --git a/cpp/src/centrality/betweenness_centrality_kernels.cuh b/cpp/src/centrality/betweenness_centrality_kernels.cuh index 3cb5add8ad6..27666095375 100644 --- a/cpp/src/centrality/betweenness_centrality_kernels.cuh +++ b/cpp/src/centrality/betweenness_centrality_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,13 +25,13 @@ namespace detail { // Should look into forAllEdge type primitive for different // load balancing template -__global__ void edges_accumulation_kernel(result_t *betweenness, +__global__ void edges_accumulation_kernel(result_t* betweenness, vertex_t number_vertices, - vertex_t const *indices, - edge_t const *offsets, - vertex_t *distances, - double *sp_counters, - double *deltas, + vertex_t const* indices, + edge_t const* offsets, + vertex_t* distances, + double* sp_counters, + double* deltas, vertex_t depth) { for (int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; thread_idx < number_vertices; @@ -58,13 +58,13 @@ __global__ void edges_accumulation_kernel(result_t *betweenness, } template -__global__ void endpoints_accumulation_kernel(result_t *betweenness, +__global__ void endpoints_accumulation_kernel(result_t* betweenness, vertex_t number_vertices, - vertex_t const *indices, - edge_t const *offsets, - vertex_t *distances, - double *sp_counters, - double *deltas, + vertex_t const* indices, + edge_t const* offsets, + vertex_t* distances, + double* sp_counters, + double* deltas, vertex_t depth) { for (int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; thread_idx < number_vertices; @@ -88,13 +88,13 @@ __global__ void endpoints_accumulation_kernel(result_t *betweenness, } } template -__global__ void accumulation_kernel(result_t *betweenness, +__global__ void accumulation_kernel(result_t* betweenness, vertex_t number_vertices, - vertex_t const *indices, - edge_t const *offsets, - vertex_t *distances, - double *sp_counters, - double *deltas, + vertex_t const* indices, + edge_t const* offsets, + vertex_t* distances, + double* sp_counters, + double* deltas, vertex_t depth) { for (int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; thread_idx < number_vertices; @@ -117,4 +117,4 @@ __global__ void accumulation_kernel(result_t *betweenness, } } } // namespace detail -} // namespace cugraph \ No newline at end of file +} // namespace cugraph diff --git a/cpp/src/centrality/katz_centrality.cu b/cpp/src/centrality/katz_centrality.cu index 0119a388680..320d76e5c03 100644 --- a/cpp/src/centrality/katz_centrality.cu +++ b/cpp/src/centrality/katz_centrality.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,14 +23,14 @@ #include #include -#include -#include "utilities/error.hpp" +#include +#include namespace cugraph { template -void katz_centrality(GraphCSRView const &graph, - result_t *result, +void katz_centrality(legacy::GraphCSRView const& graph, + result_t* result, double alpha, int max_iter, double tol, @@ -52,6 +52,6 @@ void katz_centrality(GraphCSRView const &graph, } template void katz_centrality( - GraphCSRView const &, double *, double, int, double, bool, bool); + legacy::GraphCSRView const&, double*, double, int, double, bool, bool); } // namespace cugraph diff --git a/cpp/src/community/ECG.cu b/cpp/src/community/ECG.cu deleted file mode 100644 index ce7e9dd1ad2..00000000000 --- a/cpp/src/community/ECG.cu +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include "utilities/graph_utils.cuh" - -namespace { -template -__device__ IndexType -binsearch_maxle(const IndexType *vec, const IndexType val, IndexType low, IndexType high) -{ - while (true) { - if (low == high) return low; // we know it exists - if ((low + 1) == high) return (vec[high] <= val) ? high : low; - - IndexType mid = low + (high - low) / 2; - - if (vec[mid] > val) - high = mid - 1; - else - low = mid; - } -} - -template -__global__ void match_check_kernel(IdxT size, - IdxT num_verts, - IdxT *offsets, - IdxT *indices, - IdxT *permutation, - IdxT *parts, - ValT *weights) -{ - IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; - while (tid < size) { - IdxT source = binsearch_maxle(offsets, tid, (IdxT)0, num_verts); - IdxT dest = indices[tid]; - if (parts[permutation[source]] == parts[permutation[dest]]) weights[tid] += 1; - tid += gridDim.x * blockDim.x; - } -} - -struct prg { - __host__ __device__ float operator()(int n) - { - thrust::default_random_engine rng; - thrust::uniform_real_distribution dist(0.0, 1.0); - rng.discard(n); - return dist(rng); - } -}; - -template -struct update_functor { - ValT min_value; - ValT ensemble_size; - update_functor(ValT minv, ValT es) : min_value(minv), ensemble_size(es) {} - __host__ __device__ ValT operator()(ValT input) - { - return min_value + (1 - min_value) * (input / ensemble_size); - } -}; - -/** - * Computes a random permutation vector of length size. A permutation vector of length n - * contains all values [0..n-1] exactly once. - * @param size The length of the permutation vector to generate - * @param seed A seed value for the random number generator, the generator will discard this many - * values before using values. Calling this method with the same seed will result in the same - * permutation vector. - * @return A pointer to memory containing the requested permutation vector. The caller is - * responsible for freeing the allocated memory using ALLOC_FREE_TRY(). 
- */ -template -void get_permutation_vector(T size, T seed, T *permutation, cudaStream_t stream) -{ - rmm::device_vector randoms_v(size); - - thrust::counting_iterator index(seed); - thrust::transform( - rmm::exec_policy(stream)->on(stream), index, index + size, randoms_v.begin(), prg()); - thrust::sequence(rmm::exec_policy(stream)->on(stream), permutation, permutation + size, 0); - thrust::sort_by_key( - rmm::exec_policy(stream)->on(stream), randoms_v.begin(), randoms_v.end(), permutation); -} - -} // anonymous namespace - -namespace cugraph { - -template -void ecg(raft::handle_t const &handle, - GraphCSRView const &graph, - weight_t min_weight, - vertex_t ensemble_size, - vertex_t *clustering) -{ - CUGRAPH_EXPECTS(graph.edge_data != nullptr, "API error, louvain expects a weighted graph"); - CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is NULL"); - - cudaStream_t stream{0}; - - rmm::device_vector ecg_weights_v(graph.edge_data, - graph.edge_data + graph.number_of_edges); - - vertex_t size{graph.number_of_vertices}; - vertex_t seed{1}; - - auto permuted_graph = std::make_unique>( - size, graph.number_of_edges, graph.has_data()); - - // Iterate over each member of the ensemble - for (vertex_t i = 0; i < ensemble_size; i++) { - // Take random permutation of the graph - rmm::device_vector permutation_v(size); - vertex_t *d_permutation = permutation_v.data().get(); - - get_permutation_vector(size, seed, d_permutation, stream); - seed += size; - - detail::permute_graph(graph, d_permutation, permuted_graph->view()); - - // Run one level of Louvain clustering on the random permutation - rmm::device_vector parts_v(size); - vertex_t *d_parts = parts_v.data().get(); - - cugraph::louvain(handle, permuted_graph->view(), d_parts, size_t{1}); - - // For each edge in the graph determine whether the endpoints are in the same partition - // Keep a sum for each edge of the total number of times its endpoints are in the same partition - dim3 grid, block; - block.x = 512; - grid.x = min(vertex_t{CUDA_MAX_BLOCKS}, (graph.number_of_edges / 512 + 1)); - match_check_kernel<<>>(graph.number_of_edges, - graph.number_of_vertices, - graph.offsets, - graph.indices, - permutation_v.data().get(), - d_parts, - ecg_weights_v.data().get()); - } - - // Set weights = min_weight + (1 - min-weight)*sum/ensemble_size - update_functor uf(min_weight, ensemble_size); - thrust::transform(rmm::exec_policy(stream)->on(stream), - ecg_weights_v.data().get(), - ecg_weights_v.data().get() + graph.number_of_edges, - ecg_weights_v.data().get(), - uf); - - // Run Louvain on the original graph using the computed weights - // (pass max_level = 100 for a "full run") - GraphCSRView louvain_graph; - louvain_graph.indices = graph.indices; - louvain_graph.offsets = graph.offsets; - louvain_graph.edge_data = ecg_weights_v.data().get(); - louvain_graph.number_of_vertices = graph.number_of_vertices; - louvain_graph.number_of_edges = graph.number_of_edges; - - cugraph::louvain(handle, louvain_graph, clustering, size_t{100}); -} - -// Explicit template instantiations. 
-template void ecg(raft::handle_t const &,
-                  GraphCSRView const &graph,
-                  float min_weight,
-                  int32_t ensemble_size,
-                  int32_t *clustering);
-template void ecg(raft::handle_t const &,
-                  GraphCSRView const &graph,
-                  double min_weight,
-                  int32_t ensemble_size,
-                  int32_t *clustering);
-} // namespace cugraph
diff --git a/cpp/src/community/README.md b/cpp/src/community/README.md
new file mode 100644
index 00000000000..9d635a6167f
--- /dev/null
+++ b/cpp/src/community/README.md
@@ -0,0 +1,79 @@
+# Louvain and Related Clustering Algorithms
+cuGraph contains a GPU implementation of the Louvain algorithm and several related clustering algorithms (Leiden and ECG).
+
+## Louvain
+
+The Louvain implementation is designed to assign vertices to clusters while attempting to maximize modularity. The algorithm is derived from the serial implementation described in the following paper:
+
+ * VD Blondel, J-L Guillaume, R Lambiotte and E Lefebvre: Fast unfolding of community hierarchies in large networks, J Stat Mech P10008 (2008), http://arxiv.org/abs/0803.0476
+
+It leverages some parallelism ideas from the following paper:
+ * Hao Lu, Mahantesh Halappanavar, Ananth Kalyanaraman: Parallel heuristics for scalable community detection, Elsevier Parallel Computing (2015), https://www.sciencedirect.com/science/article/pii/S0167819115000472
+
+
+The challenge in parallelizing Louvain lies in the primary loop, which visits the vertices serially. For each vertex v, the change in modularity is computed for moving the vertex from its currently assigned cluster to each of the clusters to which v's neighbors are assigned. The largest positive delta modularity is used to select a new cluster (if there are no positive delta modularities then the vertex is not moved). If the vertex v is moved to a new cluster then the statistics of v's old cluster and new cluster change. This change in cluster statistics may affect the delta modularity computations of all vertices that follow vertex v in the serial iteration, creating a dependency between the different iterations of the loop.
+
+In order to make efficient use of the GPU parallelism, the cuGraph implementation computes the delta modularity for *all* vertex/neighbor pairs using the *current* vertex assignment. Decisions on moving vertices are made based upon these delta modularities. This will potentially make choices that the serial version would not make. In order to minimize some of the negative effects of this (as described in the Lu paper), the cuGraph implementation uses an Up/Down technique: in even numbered iterations a vertex can only move from cluster i to cluster j if i > j; in odd numbered iterations a vertex can only move from cluster i to cluster j if i < j. This prevents two vertices from swapping clusters in the same iteration of the loop (a short sketch of this rule appears as a comment in the first example below). We have had great success in converging on high modularity clustering using this technique.
+
+## Calling Louvain
+
+The unit test code is the best place to search for examples of calling louvain.
+
+ * [SG Implementation](../../tests/community/louvain_test.cpp)
+ * [MG Implementation](../../tests/community/mg_louvain_test.cpp)
+
+The API itself is very simple. There are two variations:
+ * Return a flat clustering
+ * Return a Dendrogram
+
+### Return a flat clustering
+
+The example assumes that you create an SG or MG graph somehow. The caller must create the clustering vector in device memory and pass the raw pointer to that vector into the louvain function.
+
+```cpp
+#include
+...
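+// (Editorial sketch, not part of the original README example: the Up/Down
+// rule described above can be expressed as a predicate on a proposed move,
+// where `iteration`, `src_cluster`, and `dst_cluster` are hypothetical names:
+//
+//   bool allowed = (iteration % 2 == 0) ? (dst_cluster < src_cluster)
+//                                       : (dst_cluster > src_cluster);
+//
+// only positive-delta-modularity moves satisfying `allowed` are applied, so
+// two vertices can never swap clusters within the same pass.)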
+using vertex_t = int32_t;  // or int64_t, whichever is appropriate
+using weight_t = float;    // or double, whichever is appropriate
+raft::handle_t handle;           // Must be configured if MG
+auto graph_view = graph.view();  // assumes you have created a graph somehow
+
+size_t level;
+weight_t modularity;
+
+rmm::device_uvector<vertex_t> clustering_v(graph_view.get_number_of_vertices(), handle.get_stream());
+
+// louvain optionally supports two additional parameters:
+//    max_level  - maximum level of the Dendrogram
+//    resolution - constant in the modularity computation
+std::tie(level, modularity) = cugraph::louvain(handle, graph_view, clustering_v.data());
+```
+
+### Return a Dendrogram
+
+The Dendrogram represents the levels of hierarchical clustering that the Louvain algorithm computes. There is a separate function that will flatten the Dendrogram into the same flat clustering as above. Returning the Dendrogram, however, provides a finer level of detail on the intermediate results, which can be helpful in more fully understanding the data.
+
+```cpp
+#include
+...
+using vertex_t = int32_t;  // or int64_t, whichever is appropriate
+using weight_t = float;    // or double, whichever is appropriate
+raft::handle_t handle;           // Must be configured if MG
+auto graph_view = graph.view();  // assumes you have created a graph somehow
+
+cugraph::Dendrogram<vertex_t> dendrogram;
+weight_t modularity;
+
+// louvain optionally supports two additional parameters:
+//    max_level  - maximum level of the Dendrogram
+//    resolution - constant in the modularity computation
+std::tie(dendrogram, modularity) = cugraph::louvain(handle, graph_view);
+
+// This will get the equivalent result to the earlier example
+rmm::device_uvector<vertex_t> clustering_v(graph_view.get_number_of_vertices(), handle.get_stream());
+cugraph::flatten_dendrogram(handle, graph_view, dendrogram, clustering_v.data());
+```
+
+## Leiden
+
+## ECG
diff --git a/cpp/src/community/ecg.cu b/cpp/src/community/ecg.cu
new file mode 100644
index 00000000000..ca0f50c4801
--- /dev/null
+++ b/cpp/src/community/ecg.cu
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+namespace {
+template
+__device__ IndexType
+binsearch_maxle(const IndexType* vec, const IndexType val, IndexType low, IndexType high)
+{
+  while (true) {
+    if (low == high) return low;  // we know it exists
+    if ((low + 1) == high) return (vec[high] <= val) ? 
high : low; + + IndexType mid = low + (high - low) / 2; + + if (vec[mid] > val) + high = mid - 1; + else + low = mid; + } +} + +// FIXME: This shouldn't need to be a custom kernel, this +// seems like it should just be a thrust::transform +template +__global__ void match_check_kernel( + IdxT size, IdxT num_verts, IdxT* offsets, IdxT* indices, IdxT* parts, ValT* weights) +{ + IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; + while (tid < size) { + IdxT source = binsearch_maxle(offsets, tid, (IdxT)0, num_verts); + IdxT dest = indices[tid]; + if (parts[source] == parts[dest]) weights[tid] += 1; + tid += gridDim.x * blockDim.x; + } +} + +struct prg { + __device__ float operator()(int n) + { + thrust::default_random_engine rng; + thrust::uniform_real_distribution dist(0.0, 1.0); + rng.discard(n); + return dist(rng); + } +}; + +template +struct update_functor { + ValT min_value; + ValT ensemble_size; + update_functor(ValT minv, ValT es) : min_value(minv), ensemble_size(es) {} + __host__ __device__ ValT operator()(ValT input) + { + return min_value + (1 - min_value) * (input / ensemble_size); + } +}; + +/** + * Computes a random permutation vector of length size. A permutation vector of length n + * contains all values [0..n-1] exactly once. + * @param size The length of the permutation vector to generate + * @param seed A seed value for the random number generator, the generator will discard this many + * values before using values. Calling this method with the same seed will result in the same + * permutation vector. + * @return A pointer to memory containing the requested permutation vector. The caller is + * responsible for freeing the allocated memory using ALLOC_FREE_TRY(). + */ +template +void get_permutation_vector(T size, T seed, T* permutation, rmm::cuda_stream_view stream_view) +{ + rmm::device_uvector randoms_v(size, stream_view); + + thrust::counting_iterator index(seed); + thrust::transform(rmm::exec_policy(stream_view), index, index + size, randoms_v.begin(), prg()); + thrust::sequence(rmm::exec_policy(stream_view), permutation, permutation + size, 0); + thrust::sort_by_key( + rmm::exec_policy(stream_view), randoms_v.begin(), randoms_v.end(), permutation); +} + +template +class EcgLouvain : public cugraph::Louvain { + public: + using graph_t = graph_type; + using vertex_t = typename graph_type::vertex_type; + using edge_t = typename graph_type::edge_type; + using weight_t = typename graph_type::weight_type; + + EcgLouvain(raft::handle_t const& handle, graph_type const& graph, vertex_t seed) + : cugraph::Louvain(handle, graph), seed_(seed) + { + } + + void initialize_dendrogram_level(vertex_t num_vertices) override + { + this->dendrogram_->add_level(0, num_vertices, this->handle_.get_stream_view()); + + get_permutation_vector(num_vertices, + seed_, + this->dendrogram_->current_level_begin(), + this->handle_.get_stream_view()); + } + + private: + vertex_t seed_; +}; + +} // anonymous namespace + +namespace cugraph { + +template +void ecg(raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + weight_t min_weight, + vertex_t ensemble_size, + vertex_t* clustering) +{ + using graph_type = legacy::GraphCSRView; + + CUGRAPH_EXPECTS(graph.edge_data != nullptr, + "Invalid input argument: ecg expects a weighted graph"); + CUGRAPH_EXPECTS(clustering != nullptr, + "Invalid input argument: clustering is NULL, should be a device pointer to " + "memory for storing the result"); + + rmm::device_uvector ecg_weights_v(graph.number_of_edges, handle.get_stream_view()); + + 
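// Seed the working buffer with a copy of the input edge weights; each
+  // ensemble member below adds 1 to every edge whose endpoints land in the
+  // same one-level Louvain partition.
+  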
thrust::copy(rmm::exec_policy(handle.get_stream_view()),
+               graph.edge_data,
+               graph.edge_data + graph.number_of_edges,
+               ecg_weights_v.data());
+
+  vertex_t size{graph.number_of_vertices};
+
+  // FIXME: This seed should be a parameter
+  vertex_t seed{1};
+
+  // Iterate over each member of the ensemble
+  for (vertex_t i = 0; i < ensemble_size; i++) {
+    EcgLouvain runner(handle, graph, seed);
+    seed += size;
+
+    weight_t wt = runner(size_t{1}, weight_t{1});
+
+    // For each edge in the graph determine whether the endpoints are in the same partition
+    // Keep a sum for each edge of the total number of times its endpoints are in the same partition
+    dim3 grid, block;
+    block.x = 512;
+    grid.x = min(vertex_t{CUDA_MAX_BLOCKS}, (graph.number_of_edges / 512 + 1));
+    match_check_kernel<<>>(
+      graph.number_of_edges,
+      graph.number_of_vertices,
+      graph.offsets,
+      graph.indices,
+      runner.get_dendrogram().get_level_ptr_nocheck(0),
+      ecg_weights_v.data());
+  }
+
+  // Set weights = min_weight + (1 - min_weight)*sum/ensemble_size
+  update_functor uf(min_weight, ensemble_size);
+  thrust::transform(rmm::exec_policy(handle.get_stream_view()),
+                    ecg_weights_v.begin(),
+                    ecg_weights_v.end(),
+                    ecg_weights_v.begin(),
+                    uf);
+
+  // Run Louvain on the original graph using the computed weights
+  // (pass max_level = 100 for a "full run")
+  legacy::GraphCSRView louvain_graph;
+  louvain_graph.indices = graph.indices;
+  louvain_graph.offsets = graph.offsets;
+  louvain_graph.edge_data = ecg_weights_v.data();
+  louvain_graph.number_of_vertices = graph.number_of_vertices;
+  louvain_graph.number_of_edges = graph.number_of_edges;
+
+  cugraph::louvain(handle, louvain_graph, clustering, size_t{100});
+}
+
+// Explicit template instantiations.
+template void ecg(
+  raft::handle_t const&,
+  legacy::GraphCSRView const& graph,
+  float min_weight,
+  int32_t ensemble_size,
+  int32_t* clustering);
+template void ecg(
+  raft::handle_t const&,
+  legacy::GraphCSRView const& graph,
+  double min_weight,
+  int32_t ensemble_size,
+  int32_t* clustering);
+} // namespace cugraph
diff --git a/cpp/src/community/egonet.cu b/cpp/src/community/egonet.cu
new file mode 100644
index 00000000000..1e4569715af
--- /dev/null
+++ b/cpp/src/community/egonet.cu
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Alex Fender afender@nvidia.com
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+namespace {
+
+/*
+Description
+Let the egonet graph of a node x be the subgraph that includes node x, the neighborhood of x, and
+all edges between them. Naive algorithm:
+- Add center node x to the graph.
+- Go through all the neighbors y of this center node x, add edge (x, y) to the graph.
+- For each neighbor y of center node x, go through all the neighbors z of center node x, if there is +an edge between y and z in original graph, add edge (y, z) to our new graph. + +Rather than doing custom one/two hops features, we propose a generic k-hops solution leveraging BFS +cutoff and subgraph extraction +*/ + +template +std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract( + raft::handle_t const& handle, + cugraph::experimental::graph_view_t const& csr_view, + vertex_t* source_vertex, + vertex_t n_subgraphs, + vertex_t radius) +{ + auto v = csr_view.get_number_of_vertices(); + auto e = csr_view.get_number_of_edges(); + auto user_stream_view = handle.get_stream_view(); + rmm::device_vector neighbors_offsets(n_subgraphs + 1); + rmm::device_vector neighbors; + + std::vector h_source_vertex(n_subgraphs); + std::vector h_neighbors_offsets(n_subgraphs + 1); + + raft::update_host(&h_source_vertex[0], source_vertex, n_subgraphs, user_stream_view.value()); + + // Streams will allocate concurrently later + std::vector> reached{}; + reached.reserve(n_subgraphs); + for (vertex_t i = 0; i < n_subgraphs; i++) { + // Allocations and operations are attached to the worker stream + rmm::device_uvector local_reach(v, handle.get_internal_stream_view(i)); + reached.push_back(std::move(local_reach)); + } + + // h_source_vertex[i] is used by other streams in the for loop + user_stream_view.synchronize(); +#ifdef TIMING + HighResTimer hr_timer; + hr_timer.start("ego_neighbors"); +#endif + + for (vertex_t i = 0; i < n_subgraphs; i++) { + // get light handle from worker pool + raft::handle_t light_handle(handle, i); + auto worker_stream_view = light_handle.get_stream_view(); + + // BFS with cutoff + // consider adding a device API to BFS (ie. accept source on the device) + rmm::device_uvector predecessors(v, worker_stream_view); // not used + bool direction_optimizing = false; + thrust::fill(rmm::exec_policy(worker_stream_view), + reached[i].begin(), + reached[i].end(), + std::numeric_limits::max()); + thrust::fill( + rmm::exec_policy(worker_stream_view), reached[i].begin(), reached[i].begin() + 100, 1.0); + + cugraph::experimental::bfs(light_handle, + csr_view, + reached[i].data(), + predecessors.data(), + h_source_vertex[i], + direction_optimizing, + radius); + + // identify reached vertex ids from distance array + thrust::transform(rmm::exec_policy(worker_stream_view), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(v), + reached[i].begin(), + reached[i].begin(), + [sentinel = std::numeric_limits::max()] __device__( + auto id, auto val) { return val < sentinel ? 
id : sentinel; });
+
+    // removes unreached data
+    auto reached_end = thrust::remove(rmm::exec_policy(worker_stream_view),
+                                      reached[i].begin(),
+                                      reached[i].end(),
+                                      std::numeric_limits::max());
+    // release temp storage
+    reached[i].resize(thrust::distance(reached[i].begin(), reached_end), worker_stream_view);
+    reached[i].shrink_to_fit(worker_stream_view);
+  }
+
+  // wait on every one to identify their neighbors before proceeding to concatenation
+  handle.wait_on_internal_streams();
+
+  // Construct neighbors offsets (just a scan on neighborhood vector sizes)
+  h_neighbors_offsets[0] = 0;
+  for (vertex_t i = 0; i < n_subgraphs; i++) {
+    h_neighbors_offsets[i + 1] = h_neighbors_offsets[i] + reached[i].size();
+  }
+  raft::update_device(neighbors_offsets.data().get(),
+                      &h_neighbors_offsets[0],
+                      n_subgraphs + 1,
+                      user_stream_view.value());
+  neighbors.resize(h_neighbors_offsets[n_subgraphs]);
+  user_stream_view.synchronize();
+
+  // Construct the neighbors list concurrently
+  for (vertex_t i = 0; i < n_subgraphs; i++) {
+    auto worker_stream_view = handle.get_internal_stream_view(i);
+    thrust::copy(rmm::exec_policy(worker_stream_view),
+                 reached[i].begin(),
+                 reached[i].end(),
+                 neighbors.begin() + h_neighbors_offsets[i]);
+
+    // reached info is not needed anymore
+    reached[i].resize(0, worker_stream_view);
+    reached[i].shrink_to_fit(worker_stream_view);
+  }
+
+  // wait on every one before proceeding to grouped extraction
+  handle.wait_on_internal_streams();
+
+#ifdef TIMING
+  hr_timer.stop();
+  hr_timer.display(std::cout);
+#endif
+
+  // extract
+  return cugraph::experimental::extract_induced_subgraphs(
+    handle, csr_view, neighbors_offsets.data().get(), neighbors.data().get(), n_subgraphs);
+}
+
+} // namespace
+
+namespace cugraph {
+namespace experimental {
+
+template
+std::tuple,
+           rmm::device_uvector,
+           std::optional>,
+           rmm::device_uvector>
+extract_ego(raft::handle_t const& handle,
+            graph_view_t const& graph_view,
+            vertex_t* source_vertex,
+            vertex_t n_subgraphs,
+            vertex_t radius)
+{
+  if (multi_gpu) {
+    CUGRAPH_FAIL("Unimplemented.");
+    return std::make_tuple(rmm::device_uvector(0, handle.get_stream()),
+                           rmm::device_uvector(0, handle.get_stream()),
+                           rmm::device_uvector(0, handle.get_stream()),
+                           rmm::device_uvector(0, handle.get_stream()));
+  }
+  CUGRAPH_EXPECTS(n_subgraphs > 0, "Need at least one source to extract the egonet from");
+  CUGRAPH_EXPECTS(n_subgraphs < graph_view.get_number_of_vertices(),
+                  "Can't have more sources to extract from than vertices in the graph");
+  CUGRAPH_EXPECTS(radius > 0, "Radius should be at least 1");
+  CUGRAPH_EXPECTS(radius < graph_view.get_number_of_vertices(), "radius is too large");
+  // source_vertex range is checked in bfs.
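+  // All validation passed: delegate to the anonymous-namespace helper above,
+  // which runs a cutoff BFS per seed vertex on the handle's internal streams
+  // and then extracts all induced subgraphs in one grouped call.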
+ + return extract( + handle, graph_view, source_vertex, n_subgraphs, radius); +} + +// SG FP32 +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_ego(raft::handle_t const&, + graph_view_t const&, + int32_t*, + int32_t, + int32_t); +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_ego(raft::handle_t const&, + graph_view_t const&, + int32_t*, + int32_t, + int32_t); +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_ego(raft::handle_t const&, + graph_view_t const&, + int64_t*, + int64_t, + int64_t); + +// SG FP64 +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_ego(raft::handle_t const&, + graph_view_t const&, + int32_t*, + int32_t, + int32_t); +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_ego(raft::handle_t const&, + graph_view_t const&, + int32_t*, + int32_t, + int32_t); +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_ego(raft::handle_t const&, + graph_view_t const&, + int64_t*, + int64_t, + int64_t); +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/community/extract_subgraph_by_vertex.cu b/cpp/src/community/extract_subgraph_by_vertex.cu index c39b7f8ad0a..224a3417caf 100644 --- a/cpp/src/community/extract_subgraph_by_vertex.cu +++ b/cpp/src/community/extract_subgraph_by_vertex.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,9 @@ * limitations under the License. */ -#include -#include -#include +#include +#include +#include #include #include @@ -24,9 +24,9 @@ namespace { template -std::unique_ptr> extract_subgraph_by_vertices( - cugraph::GraphCOOView const &graph, - vertex_t const *vertices, +std::unique_ptr> extract_subgraph_by_vertices( + cugraph::legacy::GraphCOOView const& graph, + vertex_t const* vertices, vertex_t num_vertices, cudaStream_t stream) { @@ -35,8 +35,8 @@ std::unique_ptr> extract_subgraph_ rmm::device_vector error_count_v{1, 0}; rmm::device_vector vertex_used_v{graph_num_verts, num_vertices}; - vertex_t *d_vertex_used = vertex_used_v.data().get(); - int64_t *d_error_count = error_count_v.data().get(); + vertex_t* d_vertex_used = vertex_used_v.data().get(); + int64_t* d_error_count = error_count_v.data().get(); thrust::for_each( rmm::exec_policy(stream)->on(stream), @@ -54,9 +54,9 @@ std::unique_ptr> extract_subgraph_ CUGRAPH_EXPECTS(error_count_v[0] == 0, "Input error... 
vertices specifies vertex id out of range"); - vertex_t *graph_src = graph.src_indices; - vertex_t *graph_dst = graph.dst_indices; - weight_t *graph_weight = graph.edge_data; + vertex_t* graph_src = graph.src_indices; + vertex_t* graph_dst = graph.dst_indices; + weight_t* graph_weight = graph.edge_data; // iterate over the edges and count how many make it into the output int64_t count = thrust::count_if( @@ -70,12 +70,12 @@ std::unique_ptr> extract_subgraph_ }); if (count > 0) { - auto result = std::make_unique>( + auto result = std::make_unique>( num_vertices, count, has_weight); - vertex_t *d_new_src = result->src_indices(); - vertex_t *d_new_dst = result->dst_indices(); - weight_t *d_new_weight = result->edge_data(); + vertex_t* d_new_src = result->src_indices(); + vertex_t* d_new_dst = result->dst_indices(); + weight_t* d_new_weight = result->edge_data(); // reusing error_count as a vertex counter... thrust::for_each(rmm::exec_policy(stream)->on(stream), @@ -106,7 +106,8 @@ std::unique_ptr> extract_subgraph_ return result; } else { - return std::make_unique>(0, 0, has_weight); + return std::make_unique>( + 0, 0, has_weight); } } } // namespace @@ -115,11 +116,10 @@ namespace cugraph { namespace subgraph { template -std::unique_ptr> extract_subgraph_vertex(GraphCOOView const &graph, - VT const *vertices, - VT num_vertices) +std::unique_ptr> extract_subgraph_vertex( + legacy::GraphCOOView const& graph, VT const* vertices, VT num_vertices) { - CUGRAPH_EXPECTS(vertices != nullptr, "API error, vertices must be non null"); + CUGRAPH_EXPECTS(vertices != nullptr, "Invalid input argument: vertices must be non null"); cudaStream_t stream{0}; @@ -130,14 +130,12 @@ std::unique_ptr> extract_subgraph_vertex(GraphCOOView> -extract_subgraph_vertex(GraphCOOView const &, - int32_t const *, - int32_t); -template std::unique_ptr> -extract_subgraph_vertex(GraphCOOView const &, - int32_t const *, - int32_t); +template std::unique_ptr> +extract_subgraph_vertex( + legacy::GraphCOOView const&, int32_t const*, int32_t); +template std::unique_ptr> +extract_subgraph_vertex( + legacy::GraphCOOView const&, int32_t const*, int32_t); } // namespace subgraph } // namespace cugraph diff --git a/cpp/src/community/flatten_dendrogram.cuh b/cpp/src/community/flatten_dendrogram.cuh new file mode 100644 index 00000000000..9a8d214f883 --- /dev/null +++ b/cpp/src/community/flatten_dendrogram.cuh @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include + +#include +#include + +namespace cugraph { + +template +void partition_at_level(raft::handle_t const& handle, + Dendrogram const& dendrogram, + vertex_t const* d_vertex_ids, + vertex_t* d_partition, + size_t level) +{ + vertex_t local_num_verts = dendrogram.get_level_size_nocheck(0); + rmm::device_uvector local_vertex_ids_v(local_num_verts, handle.get_stream()); + + raft::copy(d_partition, d_vertex_ids, local_num_verts, handle.get_stream()); + + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(level), + [&handle, &dendrogram, &local_vertex_ids_v, d_vertex_ids, &d_partition, local_num_verts]( + size_t l) { + thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + local_vertex_ids_v.begin(), + local_vertex_ids_v.begin() + dendrogram.get_level_size_nocheck(l), + dendrogram.get_level_first_index_nocheck(l)); + + cugraph::experimental::relabel( + handle, + std::tuple(local_vertex_ids_v.data(), + dendrogram.get_level_ptr_nocheck(l)), + dendrogram.get_level_size_nocheck(l), + d_partition, + local_num_verts, + false); + }); +} + +} // namespace cugraph diff --git a/cpp/src/community/ktruss.cu b/cpp/src/community/ktruss.cu index 11a8ed6fbae..2216278add8 100644 --- a/cpp/src/community/ktruss.cu +++ b/cpp/src/community/ktruss.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,11 +21,11 @@ * @file ktruss.cu * --------------------------------------------------------------------------*/ -#include +#include #include #include -#include +#include #include "Static/KTruss/KTruss.cuh" using namespace hornets_nest; @@ -35,9 +35,8 @@ namespace cugraph { namespace detail { template -std::unique_ptr> ktruss_subgraph_impl(GraphCOOView const &graph, - int k, - rmm::mr::device_memory_resource *mr) +std::unique_ptr> ktruss_subgraph_impl( + legacy::GraphCOOView const& graph, int k, rmm::mr::device_memory_resource* mr) { using HornetGraph = hornet::gpu::Hornet; using UpdatePtr = hornet::BatchUpdatePtr; @@ -68,7 +67,7 @@ std::unique_ptr> ktruss_subgraph_impl(GraphCOOView>( + auto out_graph = std::make_unique>( graph.number_of_vertices, kt.getGraphEdgeCount(), graph.has_data(), stream, mr); kt.copyGraph(out_graph->src_indices(), out_graph->dst_indices()); @@ -79,8 +78,8 @@ std::unique_ptr> ktruss_subgraph_impl(GraphCOOView -std::unique_ptr> weighted_ktruss_subgraph_impl( - GraphCOOView const &graph, int k, rmm::mr::device_memory_resource *mr) +std::unique_ptr> weighted_ktruss_subgraph_impl( + legacy::GraphCOOView const& graph, int k, rmm::mr::device_memory_resource* mr) { using HornetGraph = hornet::gpu::Hornet>; using UpdatePtr = hornet::BatchUpdatePtr, hornet::DeviceType::DEVICE>; @@ -111,7 +110,7 @@ std::unique_ptr> weighted_ktruss_subgraph_impl( kt.runForK(k); CUGRAPH_EXPECTS(cudaPeekAtLastError() == cudaSuccess, "KTruss : Failed to run"); - auto out_graph = std::make_unique>( + auto out_graph = std::make_unique>( graph.number_of_vertices, kt.getGraphEdgeCount(), graph.has_data(), stream, mr); kt.copyGraph(out_graph->src_indices(), out_graph->dst_indices(), out_graph->edge_data()); @@ -125,9 +124,8 @@ std::unique_ptr> weighted_ktruss_subgraph_impl( } // namespace detail template -std::unique_ptr> k_truss_subgraph(GraphCOOView const &graph, - int k, - rmm::mr::device_memory_resource *mr) +std::unique_ptr> 
k_truss_subgraph( + legacy::GraphCOOView const& graph, int k, rmm::mr::device_memory_resource* mr) { CUGRAPH_EXPECTS(graph.src_indices != nullptr, "Graph source indices cannot be a nullptr"); CUGRAPH_EXPECTS(graph.dst_indices != nullptr, "Graph destination indices cannot be a nullptr"); @@ -139,10 +137,14 @@ std::unique_ptr> k_truss_subgraph(GraphCOOView } } -template std::unique_ptr> k_truss_subgraph( - GraphCOOView const &, int, rmm::mr::device_memory_resource *); +template std::unique_ptr> +k_truss_subgraph(legacy::GraphCOOView const&, + int, + rmm::mr::device_memory_resource*); -template std::unique_ptr> k_truss_subgraph( - GraphCOOView const &, int, rmm::mr::device_memory_resource *); +template std::unique_ptr> +k_truss_subgraph(legacy::GraphCOOView const&, + int, + rmm::mr::device_memory_resource*); } // namespace cugraph diff --git a/cpp/src/community/leiden.cu b/cpp/src/community/leiden.cu index 9e5a847cdf0..f55321dbebb 100644 --- a/cpp/src/community/leiden.cu +++ b/cpp/src/community/leiden.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,33 +14,57 @@ * limitations under the License. */ +#include #include +#include + namespace cugraph { template -std::pair leiden(raft::handle_t const &handle, - GraphCSRView const &graph, - vertex_t *clustering, +std::pair leiden(raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + vertex_t* clustering, size_t max_level, weight_t resolution) { CUGRAPH_EXPECTS(graph.edge_data != nullptr, "Invalid input argument: leiden expects a weighted graph"); - CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null"); + CUGRAPH_EXPECTS(clustering != nullptr, + "Invalid input argument: clustering is null, should be a device pointer to " + "memory for storing the result"); + + Leiden> runner(handle, graph); + weight_t wt = runner(max_level, resolution); + + rmm::device_uvector vertex_ids_v(graph.number_of_vertices, handle.get_stream()); + + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(0), // MNMG - base vertex id + thrust::make_counting_iterator( + graph.number_of_vertices), // MNMG - base vertex id + number_of_vertices + vertex_ids_v.begin()); - Leiden> runner(handle, graph); + partition_at_level(handle, + runner.get_dendrogram(), + vertex_ids_v.data(), + clustering, + runner.get_dendrogram().num_levels()); - return runner(clustering, max_level, resolution); + // FIXME: Consider returning the Dendrogram at some point + return std::make_pair(runner.get_dendrogram().num_levels(), wt); } // Explicit template instantations -template std::pair leiden( - raft::handle_t const &, GraphCSRView const &, int32_t *, size_t, float); +template std::pair leiden(raft::handle_t const&, + legacy::GraphCSRView const&, + int32_t*, + size_t, + float); -template std::pair leiden(raft::handle_t const &, - GraphCSRView const &, - int32_t *, +template std::pair leiden(raft::handle_t const&, + legacy::GraphCSRView const&, + int32_t*, size_t, double); diff --git a/cpp/src/community/leiden.cuh b/cpp/src/community/leiden.cuh index f2f84433284..252fdbf60a7 100644 --- a/cpp/src/community/leiden.cuh +++ b/cpp/src/community/leiden.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #include +#include + namespace cugraph { template @@ -27,33 +29,42 @@ class Leiden : public Louvain { using edge_t = typename graph_type::edge_type; using weight_t = typename graph_type::weight_type; - Leiden(raft::handle_t const &handle, graph_type const &graph) - : Louvain(handle, graph), constraint_v_(graph.number_of_vertices) + Leiden(raft::handle_t const& handle, graph_type const& graph) + : Louvain(handle, graph), + constraint_v_(graph.number_of_vertices, handle.get_stream()) { } weight_t update_clustering_constrained(weight_t total_edge_weight, weight_t resolution, - graph_type const &graph) + graph_type const& graph) { this->timer_start("update_clustering_constrained"); - rmm::device_vector next_cluster_v(this->cluster_v_); - rmm::device_vector delta_Q_v(graph.number_of_edges); - rmm::device_vector cluster_hash_v(graph.number_of_edges); - rmm::device_vector old_cluster_sum_v(graph.number_of_vertices); - - vertex_t const *d_src_indices = this->src_indices_v_.data().get(); - vertex_t const *d_dst_indices = graph.indices; - vertex_t *d_cluster_hash = cluster_hash_v.data().get(); - vertex_t *d_cluster = this->cluster_v_.data().get(); - weight_t const *d_vertex_weights = this->vertex_weights_v_.data().get(); - weight_t *d_cluster_weights = this->cluster_weights_v_.data().get(); - weight_t *d_delta_Q = delta_Q_v.data().get(); - vertex_t *d_constraint = constraint_v_.data().get(); - - weight_t new_Q = - this->modularity(total_edge_weight, resolution, graph, this->cluster_v_.data().get()); + rmm::device_uvector next_cluster_v(this->dendrogram_->current_level_size(), + this->handle_.get_stream_view()); + rmm::device_uvector delta_Q_v(graph.number_of_edges, this->handle_.get_stream_view()); + rmm::device_uvector cluster_hash_v(graph.number_of_edges, + this->handle_.get_stream_view()); + rmm::device_uvector old_cluster_sum_v(graph.number_of_vertices, + this->handle_.get_stream_view()); + + vertex_t const* d_src_indices = this->src_indices_v_.data(); + vertex_t const* d_dst_indices = graph.indices; + vertex_t* d_cluster_hash = cluster_hash_v.data(); + vertex_t* d_cluster = this->dendrogram_->current_level_begin(); + weight_t const* d_vertex_weights = this->vertex_weights_v_.data(); + weight_t* d_cluster_weights = this->cluster_weights_v_.data(); + weight_t* d_delta_Q = delta_Q_v.data(); + vertex_t* d_constraint = constraint_v_.data(); + + thrust::copy(rmm::exec_policy(this->handle_.get_stream_view()), + this->dendrogram_->current_level_begin(), + this->dendrogram_->current_level_end(), + next_cluster_v.data()); + + weight_t new_Q = this->modularity( + total_edge_weight, resolution, graph, this->dendrogram_->current_level_begin()); weight_t cur_Q = new_Q - 1; @@ -70,7 +81,7 @@ class Leiden : public Louvain { // Filter out positive delta_Q values for nodes not in the same constraint group thrust::for_each( - rmm::exec_policy(this->stream_)->on(this->stream_), + rmm::exec_policy(this->handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_edges), [d_src_indices, d_dst_indices, d_constraint, d_delta_Q] __device__(vertex_t i) { @@ -83,83 +94,75 @@ class Leiden : public Louvain { up_down = !up_down; - new_Q = this->modularity(total_edge_weight, resolution, graph, next_cluster_v.data().get()); + new_Q = this->modularity(total_edge_weight, resolution, graph, next_cluster_v.data()); if (new_Q > 
cur_Q) { - thrust::copy(rmm::exec_policy(this->stream_)->on(this->stream_), + thrust::copy(rmm::exec_policy(this->handle_.get_stream_view()), next_cluster_v.begin(), next_cluster_v.end(), - this->cluster_v_.begin()); + this->dendrogram_->current_level_begin()); } } - this->timer_stop(this->stream_); + this->timer_stop(this->handle_.get_stream_view()); return cur_Q; } - std::pair operator()(vertex_t *d_cluster_vec, - size_t max_level, - weight_t resolution) + weight_t operator()(size_t max_level, weight_t resolution) override { size_t num_level{0}; - weight_t total_edge_weight = thrust::reduce(rmm::exec_policy(this->stream_)->on(this->stream_), + weight_t total_edge_weight = thrust::reduce(rmm::exec_policy(this->handle_.get_stream_view()), this->weights_v_.begin(), this->weights_v_.end()); weight_t best_modularity = weight_t{-1}; - // - // Initialize every cluster to reference each vertex to itself - // - thrust::sequence(rmm::exec_policy(this->stream_)->on(this->stream_), - this->cluster_v_.begin(), - this->cluster_v_.end()); - thrust::copy(rmm::exec_policy(this->stream_)->on(this->stream_), - this->cluster_v_.begin(), - this->cluster_v_.end(), - d_cluster_vec); - // // Our copy of the graph. Each iteration of the outer loop will // shrink this copy of the graph. // - GraphCSRView current_graph(this->offsets_v_.data().get(), - this->indices_v_.data().get(), - this->weights_v_.data().get(), - this->number_of_vertices_, - this->number_of_edges_); + legacy::GraphCSRView current_graph(this->offsets_v_.data(), + this->indices_v_.data(), + this->weights_v_.data(), + this->number_of_vertices_, + this->number_of_edges_); - current_graph.get_source_indices(this->src_indices_v_.data().get()); + current_graph.get_source_indices(this->src_indices_v_.data()); while (num_level < max_level) { + // + // Initialize every cluster to reference each vertex to itself + // + this->dendrogram_->add_level( + 0, current_graph.number_of_vertices, this->handle_.get_stream_view()); + + thrust::sequence(rmm::exec_policy(this->handle_.get_stream_view()), + this->dendrogram_->current_level_begin(), + this->dendrogram_->current_level_end()); + this->compute_vertex_and_cluster_weights(current_graph); weight_t new_Q = this->update_clustering(total_edge_weight, resolution, current_graph); - thrust::copy(rmm::exec_policy(this->stream_)->on(this->stream_), - this->cluster_v_.begin(), - this->cluster_v_.end(), - constraint_v_.begin()); - new_Q = update_clustering_constrained(total_edge_weight, resolution, current_graph); if (new_Q <= best_modularity) { break; } best_modularity = new_Q; - this->shrink_graph(current_graph, d_cluster_vec); + this->shrink_graph(current_graph); num_level++; } this->timer_display(std::cout); - return std::make_pair(num_level, best_modularity); + return best_modularity; } private: - rmm::device_vector constraint_v_; + rmm::device_uvector constraint_v_; }; } // namespace cugraph diff --git a/cpp/src/community/louvain.cu b/cpp/src/community/louvain.cu index 1044211a0ce..c3df4207283 100644 --- a/cpp/src/community/louvain.cu +++ b/cpp/src/community/louvain.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,161 +14,280 @@ * limitations under the License. */ +#include #include - -// "FIXME": remove the guards after support for Pascal will be dropped; -// -// Disable louvain(experimenta::graph_view_t,...) 
-// versions for GPU architectures < 700 -//(this is because cuco/static_map.cuh would not -// compile on those) -// -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 -#include -#else +#include #include -#endif + +#include + +CUCO_DECLARE_BITWISE_COMPARABLE(float) +CUCO_DECLARE_BITWISE_COMPARABLE(double) namespace cugraph { namespace detail { template -std::pair louvain(raft::handle_t const &handle, - GraphCSRView const &graph_view, - vertex_t *clustering, - size_t max_level, - weight_t resolution) +std::pair>, weight_t> louvain( + raft::handle_t const& handle, + legacy::GraphCSRView const& graph_view, + size_t max_level, + weight_t resolution) { CUGRAPH_EXPECTS(graph_view.edge_data != nullptr, "Invalid input argument: louvain expects a weighted graph"); - CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null"); - Louvain> runner(handle, graph_view); - return runner(clustering, max_level, resolution); + Louvain> runner(handle, graph_view); + weight_t wt = runner(max_level, resolution); + + return std::make_pair(runner.move_dendrogram(), wt); } template -std::pair louvain( - raft::handle_t const &handle, - experimental::graph_view_t const &graph_view, - vertex_t *clustering, +std::pair>, weight_t> louvain( + raft::handle_t const& handle, + experimental::graph_view_t const& graph_view, size_t max_level, weight_t resolution) { - CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null"); - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 - CUGRAPH_FAIL("Louvain not supported on Pascal and older architectures"); -#else experimental::Louvain> runner(handle, graph_view); - return runner(clustering, max_level, resolution); -#endif + + weight_t wt = runner(max_level, resolution); + + return std::make_pair(runner.move_dendrogram(), wt); +} + +template +void flatten_dendrogram(raft::handle_t const& handle, + legacy::GraphCSRView const& graph_view, + Dendrogram const& dendrogram, + vertex_t* clustering) +{ + rmm::device_uvector vertex_ids_v(graph_view.number_of_vertices, handle.get_stream()); + + thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_ids_v.begin(), + vertex_ids_v.end(), + vertex_t{0}); + + partition_at_level( + handle, dendrogram, vertex_ids_v.data(), clustering, dendrogram.num_levels()); +} + +template +void flatten_dendrogram( + raft::handle_t const& handle, + experimental::graph_view_t const& graph_view, + Dendrogram const& dendrogram, + vertex_t* clustering) +{ + rmm::device_uvector vertex_ids_v(graph_view.get_number_of_vertices(), + handle.get_stream()); + + thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_ids_v.begin(), + vertex_ids_v.end(), + graph_view.get_local_vertex_first()); + + partition_at_level( + handle, dendrogram, vertex_ids_v.data(), clustering, dendrogram.num_levels()); } } // namespace detail -template -std::pair louvain(raft::handle_t const &handle, - graph_t const &graph, - typename graph_t::vertex_type *clustering, - size_t max_level, - typename graph_t::weight_type resolution) +template +std::pair>, + typename graph_view_t::weight_type> +louvain(raft::handle_t const& handle, + graph_view_t const& graph_view, + size_t max_level, + typename graph_view_t::weight_type resolution) +{ + return detail::louvain(handle, graph_view, max_level, resolution); +} + +template +void flatten_dendrogram(raft::handle_t const& handle, + graph_view_t const& graph_view, + Dendrogram const& dendrogram, + typename graph_view_t::vertex_type* 
clustering) +{ + detail::flatten_dendrogram(handle, graph_view, dendrogram, clustering); +} + +template +std::pair louvain( + raft::handle_t const& handle, + graph_view_t const& graph_view, + typename graph_view_t::vertex_type* clustering, + size_t max_level, + typename graph_view_t::weight_type resolution) { + using vertex_t = typename graph_view_t::vertex_type; + using weight_t = typename graph_view_t::weight_type; + CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null"); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 - CUGRAPH_FAIL("Louvain not supported on Pascal and older architectures"); -#else - return detail::louvain(handle, graph, clustering, max_level, resolution); -#endif + std::unique_ptr> dendrogram; + weight_t modularity; + + std::tie(dendrogram, modularity) = louvain(handle, graph_view, max_level, resolution); + + flatten_dendrogram(handle, graph_view, *dendrogram, clustering); + + return std::make_pair(dendrogram->num_levels(), modularity); } // Explicit template instantations -template std::pair louvain( - raft::handle_t const &, GraphCSRView const &, int32_t *, size_t, float); -template std::pair louvain(raft::handle_t const &, - GraphCSRView const &, - int32_t *, +template std::pair>, float> louvain( + raft::handle_t const&, + experimental::graph_view_t const&, + size_t, + float); +template std::pair>, float> louvain( + raft::handle_t const&, + experimental::graph_view_t const&, + size_t, + float); +template std::pair>, float> louvain( + raft::handle_t const&, + experimental::graph_view_t const&, + size_t, + float); +template std::pair>, double> louvain( + raft::handle_t const&, + experimental::graph_view_t const&, + size_t, + double); +template std::pair>, double> louvain( + raft::handle_t const&, + experimental::graph_view_t const&, + size_t, + double); +template std::pair>, double> louvain( + raft::handle_t const&, + experimental::graph_view_t const&, + size_t, + double); +template std::pair>, float> louvain( + raft::handle_t const&, + experimental::graph_view_t const&, + size_t, + float); +template std::pair>, float> louvain( + raft::handle_t const&, + experimental::graph_view_t const&, + size_t, + float); +template std::pair>, float> louvain( + raft::handle_t const&, + experimental::graph_view_t const&, + size_t, + float); +template std::pair>, double> louvain( + raft::handle_t const&, + experimental::graph_view_t const&, + size_t, + double); +template std::pair>, double> louvain( + raft::handle_t const&, + experimental::graph_view_t const&, + size_t, + double); +template std::pair>, double> louvain( + raft::handle_t const&, + experimental::graph_view_t const&, + size_t, + double); + +template std::pair louvain(raft::handle_t const&, + legacy::GraphCSRView const&, + int32_t*, + size_t, + float); +template std::pair louvain(raft::handle_t const&, + legacy::GraphCSRView const&, + int32_t*, size_t, double); template std::pair louvain( - raft::handle_t const &, - experimental::graph_view_t const &, - int32_t *, + raft::handle_t const&, + experimental::graph_view_t const&, + int32_t*, size_t, float); template std::pair louvain( - raft::handle_t const &, - experimental::graph_view_t const &, - int32_t *, + raft::handle_t const&, + experimental::graph_view_t const&, + int32_t*, size_t, double); template std::pair louvain( - raft::handle_t const &, - experimental::graph_view_t const &, - int32_t *, + raft::handle_t const&, + experimental::graph_view_t const&, + int32_t*, size_t, float); template std::pair louvain( - raft::handle_t const &, 
- experimental::graph_view_t const &, - int32_t *, + raft::handle_t const&, + experimental::graph_view_t const&, + int32_t*, size_t, double); template std::pair louvain( - raft::handle_t const &, - experimental::graph_view_t const &, - int64_t *, + raft::handle_t const&, + experimental::graph_view_t const&, + int64_t*, size_t, float); template std::pair louvain( - raft::handle_t const &, - experimental::graph_view_t const &, - int64_t *, + raft::handle_t const&, + experimental::graph_view_t const&, + int64_t*, size_t, double); // instantations with multi_gpu = true template std::pair louvain( - raft::handle_t const &, - experimental::graph_view_t const &, - int32_t *, + raft::handle_t const&, + experimental::graph_view_t const&, + int32_t*, size_t, float); template std::pair louvain( - raft::handle_t const &, - experimental::graph_view_t const &, - int32_t *, + raft::handle_t const&, + experimental::graph_view_t const&, + int32_t*, size_t, double); template std::pair louvain( - raft::handle_t const &, - experimental::graph_view_t const &, - int32_t *, + raft::handle_t const&, + experimental::graph_view_t const&, + int32_t*, size_t, float); template std::pair louvain( - raft::handle_t const &, - experimental::graph_view_t const &, - int32_t *, + raft::handle_t const&, + experimental::graph_view_t const&, + int32_t*, size_t, double); template std::pair louvain( - raft::handle_t const &, - experimental::graph_view_t const &, - int64_t *, + raft::handle_t const&, + experimental::graph_view_t const&, + int64_t*, size_t, float); template std::pair louvain( - raft::handle_t const &, - experimental::graph_view_t const &, - int64_t *, + raft::handle_t const&, + experimental::graph_view_t const&, + int64_t*, size_t, double); } // namespace cugraph -#include +#include diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 7ca3638f42b..31c5a2281ad 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,13 +15,16 @@ */ #pragma once -#include - -#include +#include #include #include +#include + +#include +#include + //#define TIMING #ifdef TIMING @@ -38,76 +41,95 @@ class Louvain { using edge_t = typename graph_type::edge_type; using weight_t = typename graph_type::weight_type; - Louvain(raft::handle_t const &handle, graph_type const &graph) + Louvain(raft::handle_t const& handle, graph_type const& graph) : #ifdef TIMING hr_timer_(), #endif handle_(handle), + dendrogram_(std::make_unique>()), // FIXME: Don't really need to copy here but would need // to change the logic to populate this properly // in generate_superverticies_graph. 
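A note on the pattern at work in this constructor, which recurs through the rest of the file: rmm::device_vector value-initializes its storage on the default stream and exposes .data().get(), while rmm::device_uvector is stream-ordered and deliberately uninitialized, so the old iterator-range constructors become explicit thrust::copy calls on the handle's stream and .data().get() becomes .data(). The new dendrogram_ member records one cluster array per level, which flatten_dendrogram() later collapses into the flat clustering that cluster_v_ used to hold. A self-contained sketch of the allocate-then-copy pattern; the helper name and signature are illustrative, not part of this patch:

  #include <cstddef>
  #include <rmm/cuda_stream_view.hpp>
  #include <rmm/device_uvector.hpp>
  #include <rmm/exec_policy.hpp>
  #include <thrust/copy.h>

  template <typename T>
  rmm::device_uvector<T> to_uvector(T const* src, std::size_t n, rmm::cuda_stream_view stream)
  {
    rmm::device_uvector<T> out(n, stream);  // uninitialized, stream-ordered allocation
    thrust::copy(rmm::exec_policy(stream), src, src + n, out.begin());  // explicit fill
    return out;
  }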
// - offsets_v_(graph.offsets, graph.offsets + graph.number_of_vertices + 1), - indices_v_(graph.indices, graph.indices + graph.number_of_edges), - weights_v_(graph.edge_data, graph.edge_data + graph.number_of_edges), - src_indices_v_(graph.number_of_edges), - vertex_weights_v_(graph.number_of_vertices), - cluster_weights_v_(graph.number_of_vertices), - cluster_v_(graph.number_of_vertices), - tmp_arr_v_(graph.number_of_vertices), - cluster_inverse_v_(graph.number_of_vertices), + offsets_v_(graph.number_of_vertices + 1, handle.get_stream_view()), + indices_v_(graph.number_of_edges, handle.get_stream_view()), + weights_v_(graph.number_of_edges, handle.get_stream_view()), + src_indices_v_(graph.number_of_edges, handle.get_stream_view()), + vertex_weights_v_(graph.number_of_vertices, handle.get_stream_view()), + cluster_weights_v_(graph.number_of_vertices, handle.get_stream_view()), + tmp_arr_v_(graph.number_of_vertices, handle.get_stream_view()), + cluster_inverse_v_(graph.number_of_vertices, handle.get_stream_view()), number_of_vertices_(graph.number_of_vertices), - number_of_edges_(graph.number_of_edges), - stream_(handle.get_stream()) + number_of_edges_(graph.number_of_edges) { + thrust::copy(rmm::exec_policy(handle_.get_stream_view()), + graph.offsets, + graph.offsets + graph.number_of_vertices + 1, + offsets_v_.begin()); + + thrust::copy(rmm::exec_policy(handle_.get_stream_view()), + graph.indices, + graph.indices + graph.number_of_edges, + indices_v_.begin()); + + thrust::copy(rmm::exec_policy(handle_.get_stream_view()), + graph.edge_data, + graph.edge_data + graph.number_of_edges, + weights_v_.begin()); } + virtual ~Louvain() {} + weight_t modularity(weight_t total_edge_weight, weight_t resolution, - graph_t const &graph, - vertex_t const *d_cluster) + graph_t const& graph, + vertex_t const* d_cluster) { vertex_t n_verts = graph.number_of_vertices; - rmm::device_vector inc(n_verts, weight_t{0.0}); - rmm::device_vector deg(n_verts, weight_t{0.0}); + rmm::device_uvector inc(n_verts, handle_.get_stream_view()); + rmm::device_uvector deg(n_verts, handle_.get_stream_view()); - edge_t const *d_offsets = graph.offsets; - vertex_t const *d_indices = graph.indices; - weight_t const *d_weights = graph.edge_data; - weight_t *d_inc = inc.data().get(); - weight_t *d_deg = deg.data().get(); + thrust::fill( + rmm::exec_policy(handle_.get_stream_view()), inc.begin(), inc.end(), weight_t{0.0}); + thrust::fill( + rmm::exec_policy(handle_.get_stream_view()), deg.begin(), deg.end(), weight_t{0.0}); // FIXME: Already have weighted degree computed in main loop, // could pass that in rather than computing d_deg... 
which // would save an atomicAdd (synchronization) // - thrust::for_each( - rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(graph.number_of_vertices), - [d_inc, d_deg, d_offsets, d_indices, d_weights, d_cluster] __device__(vertex_t v) { - vertex_t community = d_cluster[v]; - weight_t increase{0.0}; - weight_t degree{0.0}; - - for (edge_t loc = d_offsets[v]; loc < d_offsets[v + 1]; ++loc) { - vertex_t neighbor = d_indices[loc]; - degree += d_weights[loc]; - if (d_cluster[neighbor] == community) { increase += d_weights[loc]; } - } + thrust::for_each(rmm::exec_policy(handle_.get_stream_view()), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(graph.number_of_vertices), + [d_inc = inc.data(), + d_deg = deg.data(), + d_offsets = graph.offsets, + d_indices = graph.indices, + d_weights = graph.edge_data, + d_cluster] __device__(vertex_t v) { + vertex_t community = d_cluster[v]; + weight_t increase{0.0}; + weight_t degree{0.0}; + + for (edge_t loc = d_offsets[v]; loc < d_offsets[v + 1]; ++loc) { + vertex_t neighbor = d_indices[loc]; + degree += d_weights[loc]; + if (d_cluster[neighbor] == community) { increase += d_weights[loc]; } + } - if (degree > weight_t{0.0}) atomicAdd(d_deg + community, degree); - if (increase > weight_t{0.0}) atomicAdd(d_inc + community, increase); - }); + if (degree > weight_t{0.0}) atomicAdd(d_deg + community, degree); + if (increase > weight_t{0.0}) atomicAdd(d_inc + community, increase); + }); weight_t Q = thrust::transform_reduce( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_vertices), - [d_deg, d_inc, total_edge_weight, resolution] __device__(vertex_t community) { + [d_deg = deg.data(), d_inc = inc.data(), total_edge_weight, resolution] __device__( + vertex_t community) { return ((d_inc[community] / total_edge_weight) - resolution * (d_deg[community] * d_deg[community]) / (total_edge_weight * total_edge_weight)); @@ -118,37 +140,37 @@ class Louvain { return Q; } - virtual std::pair operator()(vertex_t *d_cluster_vec, - size_t max_level, - weight_t resolution) - { - size_t num_level{0}; + Dendrogram const& get_dendrogram() const { return *dendrogram_; } - weight_t total_edge_weight = - thrust::reduce(rmm::exec_policy(stream_)->on(stream_), weights_v_.begin(), weights_v_.end()); + Dendrogram& get_dendrogram() { return *dendrogram_; } - weight_t best_modularity = weight_t{-1}; + std::unique_ptr> move_dendrogram() { return std::move(dendrogram_); } - // - // Initialize every cluster to reference each vertex to itself - // - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), cluster_v_.begin(), cluster_v_.end()); - thrust::copy( - rmm::exec_policy(stream_)->on(stream_), cluster_v_.begin(), cluster_v_.end(), d_cluster_vec); + virtual weight_t operator()(size_t max_level, weight_t resolution) + { + weight_t total_edge_weight = thrust::reduce( + rmm::exec_policy(handle_.get_stream_view()), weights_v_.begin(), weights_v_.end()); + + weight_t best_modularity = weight_t{-1}; // // Our copy of the graph. Each iteration of the outer loop will // shrink this copy of the graph. 
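For reference, the quantity modularity() computes above (and that the outer loop maximizes) is the resolution-parameterized modularity. Writing m for total_edge_weight, inc_c for the intra-community weight accumulated into d_inc by the atomicAdd pass, deg_c for the community's weighted degree in d_deg, and \gamma for the resolution argument, the transform_reduce evaluates

  Q = \sum_{c} \left( \frac{inc_c}{m} - \gamma \, \frac{deg_c^{2}}{m^{2}} \right)

that is, the observed within-community weight minus the weight a degree-preserving random rewiring would be expected to place there.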
// - GraphCSRView current_graph(offsets_v_.data().get(), - indices_v_.data().get(), - weights_v_.data().get(), - number_of_vertices_, - number_of_edges_); + legacy::GraphCSRView current_graph(offsets_v_.data(), + indices_v_.data(), + weights_v_.data(), + number_of_vertices_, + number_of_edges_); - current_graph.get_source_indices(src_indices_v_.data().get()); + current_graph.get_source_indices(src_indices_v_.data()); + + while (dendrogram_->num_levels() < max_level) { + // + // Initialize every cluster to reference each vertex to itself + // + initialize_dendrogram_level(current_graph.number_of_vertices); - while (num_level < max_level) { compute_vertex_and_cluster_weights(current_graph); weight_t new_Q = update_clustering(total_edge_weight, resolution, current_graph); @@ -157,55 +179,62 @@ class Louvain { best_modularity = new_Q; - shrink_graph(current_graph, d_cluster_vec); - - num_level++; + shrink_graph(current_graph); } timer_display(std::cout); - return std::make_pair(num_level, best_modularity); + return best_modularity; } protected: - void timer_start(std::string const ®ion) + void timer_start(std::string const& region) { #ifdef TIMING hr_timer_.start(region); #endif } - void timer_stop(cudaStream_t stream) + void timer_stop(rmm::cuda_stream_view stream_view) { #ifdef TIMING - CUDA_TRY(cudaStreamSynchronize(stream)); + stream_view.synchronize(); hr_timer_.stop(); #endif } - void timer_display(std::ostream &os) + void timer_display(std::ostream& os) { #ifdef TIMING hr_timer_.display(os); #endif } + virtual void initialize_dendrogram_level(vertex_t num_vertices) + { + dendrogram_->add_level(0, num_vertices, handle_.get_stream_view()); + + thrust::sequence(rmm::exec_policy(handle_.get_stream_view()), + dendrogram_->current_level_begin(), + dendrogram_->current_level_end()); + } + public: - void compute_vertex_and_cluster_weights(graph_type const &graph) + void compute_vertex_and_cluster_weights(graph_type const& graph) { timer_start("compute_vertex_and_cluster_weights"); - edge_t const *d_offsets = graph.offsets; - vertex_t const *d_indices = graph.indices; - weight_t const *d_weights = graph.edge_data; - weight_t *d_vertex_weights = vertex_weights_v_.data().get(); - weight_t *d_cluster_weights = cluster_weights_v_.data().get(); + edge_t const* d_offsets = graph.offsets; + vertex_t const* d_indices = graph.indices; + weight_t const* d_weights = graph.edge_data; + weight_t* d_vertex_weights = vertex_weights_v_.data(); + weight_t* d_cluster_weights = cluster_weights_v_.data(); // // MNMG: copy_v_transform_reduce_out_nbr, then copy // thrust::for_each( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_vertices), [d_offsets, d_indices, d_weights, d_vertex_weights, d_cluster_weights] __device__( @@ -217,30 +246,34 @@ class Louvain { d_cluster_weights[src] = sum; }); - timer_stop(stream_); + timer_stop(handle_.get_stream_view()); } virtual weight_t update_clustering(weight_t total_edge_weight, weight_t resolution, - graph_type const &graph) + graph_type const& graph) { timer_start("update_clustering"); - // - // MNMG: This is the hard one, see writeup - // - rmm::device_vector next_cluster_v(cluster_v_); - rmm::device_vector delta_Q_v(graph.number_of_edges); - rmm::device_vector cluster_hash_v(graph.number_of_edges); - rmm::device_vector old_cluster_sum_v(graph.number_of_vertices); + rmm::device_uvector next_cluster_v(dendrogram_->current_level_size(), + 
handle_.get_stream_view()); + rmm::device_uvector delta_Q_v(graph.number_of_edges, handle_.get_stream_view()); + rmm::device_uvector cluster_hash_v(graph.number_of_edges, handle_.get_stream_view()); + rmm::device_uvector old_cluster_sum_v(graph.number_of_vertices, + handle_.get_stream_view()); - vertex_t *d_cluster_hash = cluster_hash_v.data().get(); - vertex_t *d_cluster = cluster_v_.data().get(); - weight_t const *d_vertex_weights = vertex_weights_v_.data().get(); - weight_t *d_cluster_weights = cluster_weights_v_.data().get(); - weight_t *d_delta_Q = delta_Q_v.data().get(); + vertex_t* d_cluster = dendrogram_->current_level_begin(); + weight_t const* d_vertex_weights = vertex_weights_v_.data(); + weight_t* d_cluster_weights = cluster_weights_v_.data(); + weight_t* d_delta_Q = delta_Q_v.data(); - weight_t new_Q = modularity(total_edge_weight, resolution, graph, cluster_v_.data().get()); + thrust::copy(rmm::exec_policy(handle_.get_stream_view()), + dendrogram_->current_level_begin(), + dendrogram_->current_level_end(), + next_cluster_v.data()); + + weight_t new_Q = + modularity(total_edge_weight, resolution, graph, dendrogram_->current_level_begin()); weight_t cur_Q = new_Q - 1; @@ -259,62 +292,56 @@ class Louvain { up_down = !up_down; - new_Q = modularity(total_edge_weight, resolution, graph, next_cluster_v.data().get()); + new_Q = modularity(total_edge_weight, resolution, graph, next_cluster_v.data()); if (new_Q > cur_Q) { - thrust::copy(rmm::exec_policy(stream_)->on(stream_), + thrust::copy(rmm::exec_policy(handle_.get_stream_view()), next_cluster_v.begin(), next_cluster_v.end(), - cluster_v_.begin()); + dendrogram_->current_level_begin()); } } - timer_stop(stream_); + timer_stop(handle_.get_stream_view()); return cur_Q; } void compute_delta_modularity(weight_t total_edge_weight, weight_t resolution, - graph_type const &graph, - rmm::device_vector &cluster_hash_v, - rmm::device_vector &old_cluster_sum_v, - rmm::device_vector &delta_Q_v) + graph_type const& graph, + rmm::device_uvector& cluster_hash_v, + rmm::device_uvector& old_cluster_sum_v, + rmm::device_uvector& delta_Q_v) { - vertex_t const *d_src_indices = src_indices_v_.data().get(); - vertex_t const *d_dst_indices = graph.indices; - edge_t const *d_offsets = graph.offsets; - weight_t const *d_weights = graph.edge_data; - vertex_t const *d_cluster = cluster_v_.data().get(); - weight_t const *d_vertex_weights = vertex_weights_v_.data().get(); - weight_t const *d_cluster_weights = cluster_weights_v_.data().get(); - - vertex_t *d_cluster_hash = cluster_hash_v.data().get(); - weight_t *d_delta_Q = delta_Q_v.data().get(); - weight_t *d_old_cluster_sum = old_cluster_sum_v.data().get(); - weight_t *d_new_cluster_sum = d_delta_Q; - - thrust::fill(cluster_hash_v.begin(), cluster_hash_v.end(), vertex_t{-1}); - thrust::fill(delta_Q_v.begin(), delta_Q_v.end(), weight_t{0.0}); - thrust::fill(old_cluster_sum_v.begin(), old_cluster_sum_v.end(), weight_t{0.0}); - - // MNMG: New technique using reduce_by_key. Would require a segmented sort - // or a pair of sorts on each node, so probably slower than what's here. - // This might still be faster even in MNMG... - // - // - // FIXME: Eventually this should use cuCollections concurrent map - // implementation, but that won't be available for a while. - // - // For each source vertex, we're going to build a hash - // table to the destination cluster ids. We can use - // the offsets ranges to define the bounds of the hash - // table. 
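The hash-table scheme described in the (removed) comment above still underlies the for_each that follows: a source vertex s has at most degree(s) distinct neighbor clusters, so the edge range [offsets[s], offsets[s+1]) of cluster_hash_v and the new-cluster-sum array is always large enough to hold them. A condensed sketch of the insert-or-accumulate step, specialized to int32_t/float with illustrative names (the real kernel also maintains old_cluster_sum alongside this):

  // Claim a slot for nbr_cluster inside s's edge range by linear probing
  // with atomicCAS, then accumulate the edge weight into that slot.
  // Assumes degree(s) >= 1, which holds because we arrived via an edge of s.
  __device__ void insert_or_add(int32_t s, int32_t nbr_cluster, float w,
                                int32_t const* offsets, int32_t* cluster_hash,
                                float* new_cluster_sum)
  {
    int32_t first = offsets[s];
    int32_t size  = offsets[s + 1] - first;
    int32_t slot  = nbr_cluster % size;
    while (true) {
      int32_t prev = atomicCAS(cluster_hash + first + slot, -1, nbr_cluster);
      if (prev == -1 || prev == nbr_cluster) {  // empty slot won, or key already present
        atomicAdd(new_cluster_sum + first + slot, w);
        break;
      }
      slot = (slot + 1) % size;  // collision: probe the next slot
    }
  }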
- // - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + edge_t const* d_offsets = graph.offsets; + weight_t const* d_weights = graph.edge_data; + vertex_t const* d_cluster = dendrogram_->current_level_begin(); + weight_t const* d_vertex_weights = vertex_weights_v_.data(); + weight_t const* d_cluster_weights = cluster_weights_v_.data(); + + vertex_t* d_cluster_hash = cluster_hash_v.data(); + weight_t* d_delta_Q = delta_Q_v.data(); + weight_t* d_old_cluster_sum = old_cluster_sum_v.data(); + weight_t* d_new_cluster_sum = d_delta_Q; + + thrust::fill(rmm::exec_policy(handle_.get_stream_view()), + cluster_hash_v.begin(), + cluster_hash_v.end(), + vertex_t{-1}); + thrust::fill(rmm::exec_policy(handle_.get_stream_view()), + delta_Q_v.begin(), + delta_Q_v.end(), + weight_t{0.0}); + thrust::fill(rmm::exec_policy(handle_.get_stream_view()), + old_cluster_sum_v.begin(), + old_cluster_sum_v.end(), + weight_t{0.0}); + + thrust::for_each(rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_edges), - [d_src_indices, - d_dst_indices, + [d_src_indices = src_indices_v_.data(), + d_dst_indices = graph.indices, d_cluster, d_offsets, d_cluster_hash, @@ -349,13 +376,13 @@ class Louvain { }); thrust::for_each( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_edges), [total_edge_weight, resolution, d_cluster_hash, - d_src_indices, + d_src_indices = src_indices_v_.data(), d_cluster, d_vertex_weights, d_delta_Q, @@ -382,34 +409,41 @@ class Louvain { }); } - void assign_nodes(graph_type const &graph, - rmm::device_vector &cluster_hash_v, - rmm::device_vector &next_cluster_v, - rmm::device_vector &delta_Q_v, + void assign_nodes(graph_type const& graph, + rmm::device_uvector& cluster_hash_v, + rmm::device_uvector& next_cluster_v, + rmm::device_uvector& delta_Q_v, bool up_down) { - rmm::device_vector temp_vertices_v(graph.number_of_vertices); - rmm::device_vector temp_cluster_v(graph.number_of_vertices, vertex_t{-1}); - rmm::device_vector temp_delta_Q_v(graph.number_of_vertices, weight_t{0.0}); - - weight_t *d_delta_Q = delta_Q_v.data().get(); - vertex_t *d_next_cluster = next_cluster_v.data().get(); - vertex_t *d_cluster_hash = cluster_hash_v.data().get(); - weight_t const *d_vertex_weights = vertex_weights_v_.data().get(); - weight_t *d_cluster_weights = cluster_weights_v_.data().get(); + rmm::device_uvector temp_vertices_v(graph.number_of_vertices, + handle_.get_stream_view()); + rmm::device_uvector temp_cluster_v(graph.number_of_vertices, + handle_.get_stream_view()); + rmm::device_uvector temp_delta_Q_v(graph.number_of_vertices, + handle_.get_stream_view()); + + thrust::fill(rmm::exec_policy(handle_.get_stream_view()), + temp_cluster_v.begin(), + temp_cluster_v.end(), + vertex_t{-1}); + + thrust::fill(rmm::exec_policy(handle_.get_stream_view()), + temp_delta_Q_v.begin(), + temp_delta_Q_v.end(), + weight_t{0}); auto cluster_reduce_iterator = - thrust::make_zip_iterator(thrust::make_tuple(d_cluster_hash, d_delta_Q)); + thrust::make_zip_iterator(thrust::make_tuple(cluster_hash_v.begin(), delta_Q_v.begin())); - auto output_edge_iterator2 = thrust::make_zip_iterator( - thrust::make_tuple(temp_cluster_v.data().get(), temp_delta_Q_v.data().get())); + auto output_edge_iterator2 = + thrust::make_zip_iterator(thrust::make_tuple(temp_cluster_v.begin(), temp_delta_Q_v.begin())); auto cluster_reduce_end = - 
thrust::reduce_by_key(rmm::exec_policy(stream_)->on(stream_), + thrust::reduce_by_key(rmm::exec_policy(handle_.get_stream_view()), src_indices_v_.begin(), src_indices_v_.end(), cluster_reduce_iterator, - temp_vertices_v.data().get(), + temp_vertices_v.data(), output_edge_iterator2, thrust::equal_to(), [] __device__(auto pair1, auto pair2) { @@ -422,22 +456,18 @@ class Louvain { return pair2; }); - vertex_t final_size = thrust::distance(temp_vertices_v.data().get(), cluster_reduce_end.first); - - vertex_t *d_temp_vertices = temp_vertices_v.data().get(); - vertex_t *d_temp_clusters = temp_cluster_v.data().get(); - weight_t *d_temp_delta_Q = temp_delta_Q_v.data().get(); + vertex_t final_size = thrust::distance(temp_vertices_v.data(), cluster_reduce_end.first); - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + thrust::for_each(rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(final_size), - [d_temp_delta_Q, - up_down, - d_next_cluster, - d_temp_vertices, - d_vertex_weights, - d_temp_clusters, - d_cluster_weights] __device__(vertex_t id) { + [up_down, + d_temp_delta_Q = temp_delta_Q_v.data(), + d_next_cluster = next_cluster_v.data(), + d_temp_vertices = temp_vertices_v.data(), + d_vertex_weights = vertex_weights_v_.data(), + d_temp_clusters = temp_cluster_v.data(), + d_cluster_weights = cluster_weights_v_.data()] __device__(vertex_t id) { if ((d_temp_clusters[id] >= 0) && (d_temp_delta_Q[id] > weight_t{0.0})) { vertex_t new_cluster = d_temp_clusters[id]; vertex_t old_cluster = d_next_cluster[d_temp_vertices[id]]; @@ -453,187 +483,176 @@ class Louvain { }); } - void shrink_graph(graph_t &graph, vertex_t *d_cluster_vec) + void shrink_graph(graph_t& graph) { timer_start("shrinking graph"); // renumber the clusters to the range 0..(num_clusters-1) - vertex_t num_clusters = renumber_clusters(d_cluster_vec); - cluster_weights_v_.resize(num_clusters); + vertex_t num_clusters = renumber_clusters(); + cluster_weights_v_.resize(num_clusters, handle_.get_stream_view()); // shrink our graph to represent the graph of supervertices generate_superverticies_graph(graph, num_clusters); - // assign each new vertex to its own cluster - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), cluster_v_.begin(), cluster_v_.end()); - - timer_stop(stream_); + timer_stop(handle_.get_stream_view()); } - vertex_t renumber_clusters(vertex_t *d_cluster_vec) + vertex_t renumber_clusters() { - vertex_t *d_tmp_array = tmp_arr_v_.data().get(); - vertex_t *d_cluster_inverse = cluster_inverse_v_.data().get(); - vertex_t *d_cluster = cluster_v_.data().get(); + vertex_t* d_tmp_array = tmp_arr_v_.data(); + vertex_t* d_cluster_inverse = cluster_inverse_v_.data(); + vertex_t* d_cluster = dendrogram_->current_level_begin(); - vertex_t old_num_clusters = cluster_v_.size(); + vertex_t old_num_clusters = dendrogram_->current_level_size(); // // New technique. 
Initialize cluster_inverse_v_ to 0 // - thrust::fill(cluster_inverse_v_.begin(), cluster_inverse_v_.end(), vertex_t{0}); + thrust::fill(rmm::exec_policy(handle_.get_stream_view()), + cluster_inverse_v_.begin(), + cluster_inverse_v_.end(), + vertex_t{0}); // - // Iterate over every element c in cluster_v_ and set cluster_inverse_v to 1 + // Iterate over every element c in the current clustering and set cluster_inverse_v to 1 // auto first_1 = thrust::make_constant_iterator(1); auto last_1 = first_1 + old_num_clusters; - thrust::scatter(rmm::exec_policy(stream_)->on(stream_), + thrust::scatter(rmm::exec_policy(handle_.get_stream_view()), first_1, last_1, - cluster_v_.begin(), + dendrogram_->current_level_begin(), cluster_inverse_v_.begin()); // // Now we'll copy all of the clusters that have a value of 1 into a temporary array // auto copy_end = thrust::copy_if( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(old_num_clusters), tmp_arr_v_.begin(), [d_cluster_inverse] __device__(const vertex_t idx) { return d_cluster_inverse[idx] == 1; }); vertex_t new_num_clusters = thrust::distance(tmp_arr_v_.begin(), copy_end); - tmp_arr_v_.resize(new_num_clusters); + tmp_arr_v_.resize(new_num_clusters, handle_.get_stream_view()); // // Now we can set each value in cluster_inverse of a cluster to its index // - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + thrust::for_each(rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(new_num_clusters), [d_cluster_inverse, d_tmp_array] __device__(const vertex_t idx) { d_cluster_inverse[d_tmp_array[idx]] = idx; }); - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + thrust::for_each(rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(old_num_clusters), [d_cluster, d_cluster_inverse] __device__(vertex_t i) { d_cluster[i] = d_cluster_inverse[d_cluster[i]]; }); - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(number_of_vertices_), - [d_cluster_vec, d_cluster] __device__(vertex_t i) { - d_cluster_vec[i] = d_cluster[d_cluster_vec[i]]; - }); - - cluster_inverse_v_.resize(new_num_clusters); - cluster_v_.resize(new_num_clusters); + cluster_inverse_v_.resize(new_num_clusters, handle_.get_stream_view()); return new_num_clusters; } - void generate_superverticies_graph(graph_t &graph, vertex_t num_clusters) + void generate_superverticies_graph(graph_t& graph, vertex_t num_clusters) { - rmm::device_vector new_src_v(graph.number_of_edges); - rmm::device_vector new_dst_v(graph.number_of_edges); - rmm::device_vector new_weight_v(graph.number_of_edges); - - vertex_t *d_old_src = src_indices_v_.data().get(); - vertex_t *d_old_dst = graph.indices; - weight_t *d_old_weight = graph.edge_data; - vertex_t *d_new_src = new_src_v.data().get(); - vertex_t *d_new_dst = new_dst_v.data().get(); - vertex_t *d_clusters = cluster_v_.data().get(); - weight_t *d_new_weight = new_weight_v.data().get(); + rmm::device_uvector new_src_v(graph.number_of_edges, handle_.get_stream_view()); + rmm::device_uvector new_dst_v(graph.number_of_edges, handle_.get_stream_view()); + rmm::device_uvector new_weight_v(graph.number_of_edges, handle_.get_stream_view()); // // Renumber the COO // - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + 
thrust::for_each(rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_edges), - [d_old_src, - d_old_dst, - d_old_weight, - d_new_src, - d_new_dst, - d_new_weight, - d_clusters] __device__(edge_t e) { + [d_old_src = src_indices_v_.data(), + d_old_dst = graph.indices, + d_old_weight = graph.edge_data, + d_new_src = new_src_v.data(), + d_new_dst = new_dst_v.data(), + d_new_weight = new_weight_v.data(), + d_clusters = dendrogram_->current_level_begin()] __device__(edge_t e) { d_new_src[e] = d_clusters[d_old_src[e]]; d_new_dst[e] = d_clusters[d_old_dst[e]]; d_new_weight[e] = d_old_weight[e]; }); thrust::stable_sort_by_key( - rmm::exec_policy(stream_)->on(stream_), - d_new_dst, - d_new_dst + graph.number_of_edges, - thrust::make_zip_iterator(thrust::make_tuple(d_new_src, d_new_weight))); + rmm::exec_policy(handle_.get_stream_view()), + new_dst_v.begin(), + new_dst_v.end(), + thrust::make_zip_iterator(thrust::make_tuple(new_src_v.begin(), new_weight_v.begin()))); thrust::stable_sort_by_key( - rmm::exec_policy(stream_)->on(stream_), - d_new_src, - d_new_src + graph.number_of_edges, - thrust::make_zip_iterator(thrust::make_tuple(d_new_dst, d_new_weight))); + rmm::exec_policy(handle_.get_stream_view()), + new_src_v.begin(), + new_src_v.end(), + thrust::make_zip_iterator(thrust::make_tuple(new_dst_v.begin(), new_weight_v.begin()))); // // Now we reduce by key to combine the weights of duplicate // edges. // - auto start = thrust::make_zip_iterator(thrust::make_tuple(d_new_src, d_new_dst)); - auto new_start = thrust::make_zip_iterator(thrust::make_tuple(d_old_src, d_old_dst)); - auto new_end = thrust::reduce_by_key(rmm::exec_policy(stream_)->on(stream_), + auto start = + thrust::make_zip_iterator(thrust::make_tuple(new_src_v.begin(), new_dst_v.begin())); + auto new_start = + thrust::make_zip_iterator(thrust::make_tuple(src_indices_v_.data(), graph.indices)); + auto new_end = thrust::reduce_by_key(rmm::exec_policy(handle_.get_stream_view()), start, start + graph.number_of_edges, - d_new_weight, + new_weight_v.begin(), new_start, - d_old_weight, + graph.edge_data, thrust::equal_to>(), thrust::plus()); graph.number_of_edges = thrust::distance(new_start, new_end.first); graph.number_of_vertices = num_clusters; - detail::fill_offset(d_old_src, graph.offsets, num_clusters, graph.number_of_edges, stream_); - CHECK_CUDA(stream_); + detail::fill_offset(src_indices_v_.data(), + graph.offsets, + num_clusters, + graph.number_of_edges, + handle_.get_stream_view()); - src_indices_v_.resize(graph.number_of_edges); - indices_v_.resize(graph.number_of_edges); - weights_v_.resize(graph.number_of_edges); + src_indices_v_.resize(graph.number_of_edges, handle_.get_stream_view()); + indices_v_.resize(graph.number_of_edges, handle_.get_stream_view()); + weights_v_.resize(graph.number_of_edges, handle_.get_stream_view()); } protected: - raft::handle_t const &handle_; + raft::handle_t const& handle_; vertex_t number_of_vertices_; edge_t number_of_edges_; - cudaStream_t stream_; + + std::unique_ptr> dendrogram_; // // Copy of graph // - rmm::device_vector offsets_v_; - rmm::device_vector indices_v_; - rmm::device_vector weights_v_; - rmm::device_vector src_indices_v_; + rmm::device_uvector offsets_v_; + rmm::device_uvector indices_v_; + rmm::device_uvector weights_v_; + rmm::device_uvector src_indices_v_; // // Weights and clustering across iterations of algorithm // - rmm::device_vector vertex_weights_v_; - rmm::device_vector cluster_weights_v_; - 
rmm::device_vector cluster_v_; + rmm::device_uvector vertex_weights_v_; + rmm::device_uvector cluster_weights_v_; // // Temporaries used within kernels. Each iteration uses less // of this memory // - rmm::device_vector tmp_arr_v_; - rmm::device_vector cluster_inverse_v_; + rmm::device_uvector tmp_arr_v_; + rmm::device_uvector cluster_inverse_v_; #ifdef TIMING HighResTimer hr_timer_; diff --git a/cpp/src/community/spectral_clustering.cu b/cpp/src/community/spectral_clustering.cu index f32739ddf29..4dd27a56b70 100644 --- a/cpp/src/community/spectral_clustering.cu +++ b/cpp/src/community/spectral_clustering.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,14 +20,14 @@ * @file spectral_clustering.cu * ---------------------------------------------------------------------------**/ -#include +#include #include #include #include -#include -#include +#include +#include #include #include @@ -39,16 +39,16 @@ namespace ext_raft { namespace detail { template -void balancedCutClustering_impl(GraphCSRView const &graph, +void balancedCutClustering_impl(legacy::GraphCSRView const& graph, vertex_t n_clusters, vertex_t n_eig_vects, weight_t evs_tolerance, int evs_max_iter, weight_t kmean_tolerance, int kmean_max_iter, - vertex_t *clustering, - weight_t *eig_vals, - weight_t *eig_vects) + vertex_t* clustering, + weight_t* eig_vals, + weight_t* eig_vects) { RAFT_EXPECTS(graph.edge_data != nullptr, "API error, graph must have weights"); RAFT_EXPECTS(evs_tolerance >= weight_t{0.0}, @@ -109,16 +109,17 @@ void balancedCutClustering_impl(GraphCSRView const & } template -void spectralModularityMaximization_impl(GraphCSRView const &graph, - vertex_t n_clusters, - vertex_t n_eig_vects, - weight_t evs_tolerance, - int evs_max_iter, - weight_t kmean_tolerance, - int kmean_max_iter, - vertex_t *clustering, - weight_t *eig_vals, - weight_t *eig_vects) +void spectralModularityMaximization_impl( + legacy::GraphCSRView const& graph, + vertex_t n_clusters, + vertex_t n_eig_vects, + weight_t evs_tolerance, + int evs_max_iter, + weight_t kmean_tolerance, + int kmean_max_iter, + vertex_t* clustering, + weight_t* eig_vals, + weight_t* eig_vects) { RAFT_EXPECTS(graph.edge_data != nullptr, "API error, graph must have weights"); RAFT_EXPECTS(evs_tolerance >= weight_t{0.0}, @@ -186,10 +187,10 @@ void spectralModularityMaximization_impl(GraphCSRView -void analyzeModularityClustering_impl(GraphCSRView const &graph, +void analyzeModularityClustering_impl(legacy::GraphCSRView const& graph, int n_clusters, - vertex_t const *clustering, - weight_t *modularity) + vertex_t const* clustering, + weight_t* modularity) { raft::handle_t handle; auto stream = handle.get_stream(); @@ -207,11 +208,11 @@ void analyzeModularityClustering_impl(GraphCSRView c } template -void analyzeBalancedCut_impl(GraphCSRView const &graph, +void analyzeBalancedCut_impl(legacy::GraphCSRView const& graph, vertex_t n_clusters, - vertex_t const *clustering, - weight_t *edgeCut, - weight_t *ratioCut) + vertex_t const* clustering, + weight_t* edgeCut, + weight_t* ratioCut) { raft::handle_t handle; auto stream = handle.get_stream(); @@ -240,14 +241,14 @@ void analyzeBalancedCut_impl(GraphCSRView const &gra } // namespace detail template -void balancedCutClustering(GraphCSRView const &graph, +void balancedCutClustering(legacy::GraphCSRView const& graph, VT num_clusters, 
VT num_eigen_vects, WT evs_tolerance, int evs_max_iter, WT kmean_tolerance, int kmean_max_iter, - VT *clustering) + VT* clustering) { rmm::device_vector eig_vals(num_eigen_vects); rmm::device_vector eig_vects(num_eigen_vects * graph.number_of_vertices); @@ -265,14 +266,14 @@ void balancedCutClustering(GraphCSRView const &graph, } template -void spectralModularityMaximization(GraphCSRView const &graph, +void spectralModularityMaximization(legacy::GraphCSRView const& graph, VT n_clusters, VT n_eigen_vects, WT evs_tolerance, int evs_max_iter, WT kmean_tolerance, int kmean_max_iter, - VT *clustering) + VT* clustering) { rmm::device_vector eig_vals(n_eigen_vects); rmm::device_vector eig_vects(n_eigen_vects * graph.number_of_vertices); @@ -290,66 +291,54 @@ void spectralModularityMaximization(GraphCSRView const &graph, } template -void analyzeClustering_modularity(GraphCSRView const &graph, +void analyzeClustering_modularity(legacy::GraphCSRView const& graph, int n_clusters, - VT const *clustering, - WT *score) + VT const* clustering, + WT* score) { detail::analyzeModularityClustering_impl(graph, n_clusters, clustering, score); } template -void analyzeClustering_edge_cut(GraphCSRView const &graph, +void analyzeClustering_edge_cut(legacy::GraphCSRView const& graph, int n_clusters, - VT const *clustering, - WT *score) + VT const* clustering, + WT* score) { WT dummy{0.0}; detail::analyzeBalancedCut_impl(graph, n_clusters, clustering, score, &dummy); } template -void analyzeClustering_ratio_cut(GraphCSRView const &graph, +void analyzeClustering_ratio_cut(legacy::GraphCSRView const& graph, int n_clusters, - VT const *clustering, - WT *score) + VT const* clustering, + WT* score) { WT dummy{0.0}; detail::analyzeBalancedCut_impl(graph, n_clusters, clustering, &dummy, score); } template void balancedCutClustering( - GraphCSRView const &, int, int, float, int, float, int, int *); + legacy::GraphCSRView const&, int, int, float, int, float, int, int*); template void balancedCutClustering( - GraphCSRView const &, int, int, double, int, double, int, int *); + legacy::GraphCSRView const&, int, int, double, int, double, int, int*); template void spectralModularityMaximization( - GraphCSRView const &, int, int, float, int, float, int, int *); + legacy::GraphCSRView const&, int, int, float, int, float, int, int*); template void spectralModularityMaximization( - GraphCSRView const &, int, int, double, int, double, int, int *); -template void analyzeClustering_modularity(GraphCSRView const &, - int, - int const *, - float *); -template void analyzeClustering_modularity(GraphCSRView const &, - int, - int const *, - double *); -template void analyzeClustering_edge_cut(GraphCSRView const &, - int, - int const *, - float *); -template void analyzeClustering_edge_cut(GraphCSRView const &, - int, - int const *, - double *); -template void analyzeClustering_ratio_cut(GraphCSRView const &, - int, - int const *, - float *); -template void analyzeClustering_ratio_cut(GraphCSRView const &, - int, - int const *, - double *); + legacy::GraphCSRView const&, int, int, double, int, double, int, int*); +template void analyzeClustering_modularity( + legacy::GraphCSRView const&, int, int const*, float*); +template void analyzeClustering_modularity( + legacy::GraphCSRView const&, int, int const*, double*); +template void analyzeClustering_edge_cut( + legacy::GraphCSRView const&, int, int const*, float*); +template void analyzeClustering_edge_cut( + legacy::GraphCSRView const&, int, int const*, double*); +template void 
analyzeClustering_ratio_cut( + legacy::GraphCSRView const&, int, int const*, float*); +template void analyzeClustering_ratio_cut( + legacy::GraphCSRView const&, int, int const*, double*); } // namespace ext_raft } // namespace cugraph diff --git a/cpp/src/community/triangles_counting.cu b/cpp/src/community/triangles_counting.cu index f6670365652..97543d28c62 100644 --- a/cpp/src/community/triangles_counting.cu +++ b/cpp/src/community/triangles_counting.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,10 +17,10 @@ #include #include -#include -#include +#include +#include -#include +#include #include #include @@ -31,8 +31,8 @@ #include "cub/cub.cuh" #define TH_CENT_K_LOCLEN (34) -#define WP_LEN_TH1 (24) -#define WP_LEN_TH2 (2) +#define WP_LEN_TH1 (24) +#define WP_LEN_TH2 (2) #if WP_LEN_TH1 > 32 #error WP_LEN_TH1 must be <= 32! @@ -41,9 +41,9 @@ #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MAX(x, y) (((x) > (y)) ? (x) : (y)) -#define THREADS (128) +#define THREADS (128) #define DIV_UP(a, b) (((a) + ((b)-1)) / (b)) -#define BITSOF(x) (sizeof(*x) * 8) +#define BITSOF(x) (sizeof(*x) * 8) #define BLK_BWL0 (128) @@ -73,9 +73,9 @@ struct spmat_t { T N; T nnz; T nrows; - const T *roff_d; - const T *rows_d; - const T *cols_d; + const T* roff_d; + const T* rows_d; + const T* cols_d; bool is_lower_triangular; }; @@ -173,7 +173,9 @@ __device__ __forceinline__ T block_sum(T v) const int wid = threadIdx.x / 32 + ((BDIM_Y > 1) ? threadIdx.y * (BDIM_X / 32) : 0); #pragma unroll - for (int i = WSIZE / 2; i; i >>= 1) { v += __shfl_down_sync(raft::warp_full_mask(), v, i); } + for (int i = WSIZE / 2; i; i >>= 1) { + v += __shfl_down_sync(raft::warp_full_mask(), v, i); + } if (lid == 0) sh[wid] = v; __syncthreads(); @@ -197,13 +199,13 @@ template __global__ void tricnt_b2b_k(const ROW_T ner, - const ROW_T *__restrict__ rows, - const OFF_T *__restrict__ roff, - const ROW_T *__restrict__ cols, - CNT_T *__restrict__ ocnt, - MAP_T *__restrict__ bmapL0, + const ROW_T* __restrict__ rows, + const OFF_T* __restrict__ roff, + const ROW_T* __restrict__ cols, + CNT_T* __restrict__ ocnt, + MAP_T* __restrict__ bmapL0, const size_t bmldL0, - MAP_T *__restrict__ bmapL1, + MAP_T* __restrict__ bmapL1, const size_t bmldL1) { CNT_T __cnt = 0; @@ -277,11 +279,11 @@ __global__ void tricnt_b2b_k(const ROW_T ner, template void tricnt_b2b(T nblock, - spmat_t *m, - uint64_t *ocnt_d, - unsigned int *bmapL0_d, + spmat_t* m, + uint64_t* ocnt_d, + unsigned int* bmapL0_d, size_t bmldL0, - unsigned int *bmapL1_d, + unsigned int* bmapL1_d, size_t bmldL1, cudaStream_t stream) { @@ -294,13 +296,15 @@ void tricnt_b2b(T nblock, ////////////////////////////////////////////////////////////////////////////////////////// template -__device__ __forceinline__ T block_sum_sh(T v, T *sh) +__device__ __forceinline__ T block_sum_sh(T v, T* sh) { const int lid = threadIdx.x % 32; const int wid = threadIdx.x / 32 + ((BDIM_Y > 1) ? 
threadIdx.y * (BDIM_X / 32) : 0); #pragma unroll - for (int i = WSIZE / 2; i; i >>= 1) { v += __shfl_down_sync(raft::warp_full_mask(), v, i); } + for (int i = WSIZE / 2; i; i >>= 1) { + v += __shfl_down_sync(raft::warp_full_mask(), v, i); + } if (lid == 0) sh[wid] = v; __syncthreads(); @@ -317,10 +321,10 @@ __device__ __forceinline__ T block_sum_sh(T v, T *sh) template __global__ void tricnt_bsh_k(const ROW_T ner, - const ROW_T *__restrict__ rows, - const OFF_T *__restrict__ roff, - const ROW_T *__restrict__ cols, - CNT_T *__restrict__ ocnt, + const ROW_T* __restrict__ rows, + const OFF_T* __restrict__ roff, + const ROW_T* __restrict__ cols, + CNT_T* __restrict__ ocnt, const size_t bmld) { CNT_T __cnt = 0; @@ -374,7 +378,7 @@ __global__ void tricnt_bsh_k(const ROW_T ner, __syncthreads(); if (lastcol - firstcol < rend - rbeg) { for (int i = firstcol; i <= lastcol; i += BDIM) { - if (i + threadIdx.x <= lastcol) { ((unsigned long long *)shm)[i + threadIdx.x] = 0ull; } + if (i + threadIdx.x <= lastcol) { ((unsigned long long*)shm)[i + threadIdx.x] = 0ull; } } } else { for (int i = rbeg; i < rend; i += BDIM) { @@ -383,14 +387,14 @@ __global__ void tricnt_bsh_k(const ROW_T ner, } __syncthreads(); } - __cnt = block_sum_sh(__cnt, (uint64_t *)shm); + __cnt = block_sum_sh(__cnt, (uint64_t*)shm); if (threadIdx.x == 0) ocnt[blockIdx.x] = __cnt; return; } template -void tricnt_bsh(T nblock, spmat_t *m, uint64_t *ocnt_d, size_t bmld, cudaStream_t stream) +void tricnt_bsh(T nblock, spmat_t* m, uint64_t* ocnt_d, size_t bmld, cudaStream_t stream) { tricnt_bsh_k<<>>( m->nrows, m->rows_d, m->roff_d, m->cols_d, ocnt_d, bmld); @@ -408,11 +412,11 @@ template __global__ void tricnt_wrp_ps_k(const ROW_T ner, - const ROW_T *__restrict__ rows, - const OFF_T *__restrict__ roff, - const ROW_T *__restrict__ cols, - CNT_T *__restrict__ ocnt, - MAP_T *__restrict__ bmap, + const ROW_T* __restrict__ rows, + const OFF_T* __restrict__ roff, + const ROW_T* __restrict__ cols, + CNT_T* __restrict__ ocnt, + MAP_T* __restrict__ bmap, const size_t bmld) { __shared__ OFF_T sho[NWARP][WSIZE]; @@ -520,7 +524,7 @@ __global__ void tricnt_wrp_ps_k(const ROW_T ner, if (lastcol - firstcol < rend - rbeg) { for (int i = firstcol; i <= lastcol; i += WSIZE) { - if (i + threadIdx.x <= lastcol) { ((unsigned long long *)bmap)[i + threadIdx.x] = 0ull; } + if (i + threadIdx.x <= lastcol) { ((unsigned long long*)bmap)[i + threadIdx.x] = 0ull; } } } else { for (int i = rbeg; i < rend; i += WSIZE) { @@ -537,7 +541,7 @@ __global__ void tricnt_wrp_ps_k(const ROW_T ner, template void tricnt_wrp( - T nblock, spmat_t *m, uint64_t *ocnt_d, unsigned int *bmap_d, size_t bmld, cudaStream_t stream) + T nblock, spmat_t* m, uint64_t* ocnt_d, unsigned int* bmap_d, size_t bmld, cudaStream_t stream) { dim3 block(32, THREADS / 32); tricnt_wrp_ps_k<32, THREADS / 32, WP_LEN_TH1, WP_LEN_TH2> @@ -549,10 +553,10 @@ void tricnt_wrp( ////////////////////////////////////////////////////////////////////////////////////////// template __global__ void tricnt_thr_k(const ROW_T ner, - const ROW_T *__restrict__ rows, - const OFF_T *__restrict__ roff, - const ROW_T *__restrict__ cols, - CNT_T *__restrict__ ocnt) + const ROW_T* __restrict__ rows, + const OFF_T* __restrict__ roff, + const ROW_T* __restrict__ cols, + CNT_T* __restrict__ ocnt) { CNT_T __cnt = 0; const ROW_T tid = blockIdx.x * BDIM + threadIdx.x; @@ -619,7 +623,7 @@ __global__ void tricnt_thr_k(const ROW_T ner, } template -void tricnt_thr(T nblock, spmat_t *m, uint64_t *ocnt_d, cudaStream_t stream) +void tricnt_thr(T 
nblock, spmat_t* m, uint64_t* ocnt_d, cudaStream_t stream) { cudaFuncSetCacheConfig(tricnt_thr_k *m, uint64_t *ocnt_d, cudaStream_t stream) ///////////////////////////////////////////////////////////////// template struct NonEmptyRow { - const IndexType *p_roff; - __host__ __device__ NonEmptyRow(const IndexType *roff) : p_roff(roff) {} - __host__ __device__ __forceinline__ bool operator()(const IndexType &a) const + const IndexType* p_roff; + __host__ __device__ NonEmptyRow(const IndexType* roff) : p_roff(roff) {} + __host__ __device__ __forceinline__ bool operator()(const IndexType& a) const { return (p_roff[a] < p_roff[a + 1]); } @@ -647,7 +651,7 @@ struct NonEmptyRow { template void create_nondangling_vector( - const T *roff, T *p_nonempty, T *n_nonempty, size_t n, cudaStream_t stream) + const T* roff, T* p_nonempty, T* n_nonempty, size_t n, cudaStream_t stream) { if (n <= 0) return; thrust::counting_iterator it(0); @@ -660,7 +664,7 @@ void create_nondangling_vector( } template -uint64_t reduce(uint64_t *v_d, T n, cudaStream_t stream) +uint64_t reduce(uint64_t* v_d, T n, cudaStream_t stream) { rmm::device_vector tmp(1); @@ -694,8 +698,8 @@ class TrianglesCount { // Simple constructor TrianglesCount(IndexType num_vertices, IndexType num_edges, - IndexType const *row_offsets, - IndexType const *col_indices, + IndexType const* row_offsets, + IndexType const* col_indices, cudaStream_t stream = NULL); void count(); @@ -705,8 +709,8 @@ class TrianglesCount { template TrianglesCount::TrianglesCount(IndexType num_vertices, IndexType num_edges, - IndexType const *row_offsets, - IndexType const *col_indices, + IndexType const* row_offsets, + IndexType const* col_indices, cudaStream_t stream) : m_mat{num_vertices, num_edges, num_vertices, row_offsets, nullptr, col_indices}, m_stream{stream}, @@ -759,8 +763,9 @@ void TrianglesCount::tcount_b2b() cudaMemGetInfo(&free_bytes, &total_bytes); CHECK_CUDA(m_stream); - int nblock = (free_bytes * 95 / 100) / (sizeof(uint32_t) * bmldL1); //@TODO: what? 
- nblock = MIN(nblock, m_mat.nrows); + size_t nblock_available = (free_bytes * 95 / 100) / (sizeof(uint32_t) * bmldL1); + + int nblock = static_cast(MIN(nblock_available, static_cast(m_mat.nrows))); // allocate level 1 bitmap rmm::device_vector bmapL1_d(bmldL1 * nblock, uint32_t{0}); @@ -793,8 +798,10 @@ void TrianglesCount::tcount_wrp() cudaMemGetInfo(&free_bytes, &total_bytes); CHECK_CUDA(m_stream); - int nblock = (free_bytes * 95 / 100) / (sizeof(uint32_t) * bmld * (THREADS / 32)); - nblock = MIN(nblock, DIV_UP(m_mat.nrows, (THREADS / 32))); + size_t nblock_available = (free_bytes * 95 / 100) / (sizeof(uint32_t) * bmld * (THREADS / 32)); + + int nblock = static_cast( + MIN(nblock_available, static_cast(DIV_UP(m_mat.nrows, (THREADS / 32))))); size_t bmap_sz = bmld * nblock * (THREADS / 32); @@ -827,7 +834,8 @@ void TrianglesCount::count() tcount_wrp(); else { const int shMinBlkXSM = 6; - if (size_t{m_shared_mem_per_block * 8 / shMinBlkXSM} < (size_t)m_mat.N) + if (static_cast(m_shared_mem_per_block * 8 / shMinBlkXSM) < + static_cast(m_mat.N)) tcount_b2b(); else tcount_bsh(); @@ -837,7 +845,7 @@ void TrianglesCount::count() } // namespace template -uint64_t triangle_count(GraphCSRView const &graph) +uint64_t triangle_count(legacy::GraphCSRView const& graph) { TrianglesCount counter( graph.number_of_vertices, graph.number_of_edges, graph.offsets, graph.indices); @@ -847,7 +855,7 @@ uint64_t triangle_count(GraphCSRView const &graph) } template uint64_t triangle_count( - GraphCSRView const &); + legacy::GraphCSRView const&); } // namespace triangle } // namespace cugraph diff --git a/cpp/src/components/connectivity.cu b/cpp/src/components/connectivity.cu index 2cc1da017a9..85134c1ad67 100644 --- a/cpp/src/components/connectivity.cu +++ b/cpp/src/components/connectivity.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
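Stepping back from the formatting churn: the four tricnt_* code paths above (block-to-block bitmap, shared-memory bitmap, warp-private bitmap, and plain per-thread) all evaluate, in different ways, the same sum over the lower-triangular adjacency,

  \Delta = \sum_{(u,v) \in E,\ v < u} \bigl| N(u) \cap N(v) \bigr|

differing only in how the membership test for N(u) is materialized; the visible branch in count() dispatches on shared-memory capacity versus N. The nblock edits are a correctness fix rather than a style fix: free_bytes from cudaMemGetInfo is GPU-memory-sized, so the old code could narrow an out-of-int-range quotient to int before the MIN clamp ran, while the new code clamps in size_t first and casts only afterward.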
@@ -19,13 +19,13 @@ #include -#include #include -#include +#include +#include +#include #include #include -#include "utilities/error.hpp" -#include "utilities/graph_utils.cuh" +#include #include "topology/topology.cuh" @@ -57,15 +57,15 @@ namespace detail { */ template std::enable_if_t::value> connected_components_impl( - GraphCSRView const &graph, + legacy::GraphCSRView const& graph, cugraph_cc_t connectivity_type, - VT *labels, + VT* labels, cudaStream_t stream) { using ByteT = unsigned char; // minimum addressable unit - CUGRAPH_EXPECTS(graph.offsets != nullptr, "Invalid API parameter: graph.offsets is nullptr"); - CUGRAPH_EXPECTS(graph.indices != nullptr, "Invalid API parameter: graph.indices is nullptr"); + CUGRAPH_EXPECTS(graph.offsets != nullptr, "Invalid input argument: graph.offsets is nullptr"); + CUGRAPH_EXPECTS(graph.indices != nullptr, "Invalid input argument: graph.indices is nullptr"); VT nrows = graph.number_of_vertices; @@ -78,26 +78,26 @@ std::enable_if_t::value> connected_components_impl( stream); } else { SCC_Data sccd(nrows, graph.offsets, graph.indices); - sccd.run_scc(labels); + auto num_iters = sccd.run_scc(labels); } } } // namespace detail template -void connected_components(GraphCSRView const &graph, +void connected_components(legacy::GraphCSRView const& graph, cugraph_cc_t connectivity_type, - VT *labels) + VT* labels) { cudaStream_t stream{nullptr}; - CUGRAPH_EXPECTS(labels != nullptr, "Invalid API parameter: labels parameter is NULL"); + CUGRAPH_EXPECTS(labels != nullptr, "Invalid input argument: labels parameter is NULL"); return detail::connected_components_impl(graph, connectivity_type, labels, stream); } template void connected_components( - GraphCSRView const &, cugraph_cc_t, int32_t *); + legacy::GraphCSRView const&, cugraph_cc_t, int32_t*); template void connected_components( - GraphCSRView const &, cugraph_cc_t, int64_t *); + legacy::GraphCSRView const&, cugraph_cc_t, int64_t*); } // namespace cugraph diff --git a/cpp/src/components/scc_matrix.cuh b/cpp/src/components/scc_matrix.cuh index 801f1fe0fad..c7f4506b74e 100644 --- a/cpp/src/components/scc_matrix.cuh +++ b/cpp/src/components/scc_matrix.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
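The scc_matrix.cuh hunks that follow are easier to read with the underlying algorithm in mind: SCC_Data computes strongly connected components by Boolean closure over the (\land, \lor) semiring, exactly as the new in-code comments describe. Each sweep of the do-loop updates

  C^{(t+1)}_{ij} = [\, i = j \,] \lor C^{(t)}_{ij} \lor \bigvee_{k \in N(i)} C^{(t)}_{kj}

until no entry changes, so the fixed point is the reachability matrix R; the post-loop pass forms R \land R^{\top}, since i and j share an SCC exactly when each reaches the other, and get_labels() then assigns each row the index of its first set column. Note the quadratic footprint: d_C and d_Cprev are each nrows * nrows bytes, which bounds the practical problem size of this code path.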
@@ -71,12 +71,13 @@ struct SCC_Data {
       p_d_r_o_(p_d_r_o),
       p_d_c_i_(p_d_c_i),
       d_C(nrows * nrows, 0),
-      d_Cprev(nrows * nrows, 0)
+      d_Cprev(nrows * nrows, 0),
+      p_d_C_(d_C.data().get())
   {
     init();
   }

-  const thrust::device_vector<ByteT>& get_C(void) const { return d_C; }
+  ByteT const* get_Cptr(void) const { return p_d_C_; }

   size_t nrows(void) const { return nrows_; }

@@ -100,13 +101,12 @@ struct SCC_Data {

   void get_labels(IndexT* d_labels) const
   {
-    auto* p_d_C = d_C.data().get();
-    size_t n = nrows_; // for lambda capture, since I cannot capture `this` (host), or `nrows_`
+    size_t n = nrows_;  // for lambda capture, since I cannot capture `this` (host), or `nrows_`
     thrust::transform(thrust::device,
                       thrust::make_counting_iterator<IndexT>(0),
                       thrust::make_counting_iterator<IndexT>(nrows_),
                       d_labels,
-                      [n, p_d_C] __device__(IndexT k) {
+                      [n, p_d_C = p_d_C_] __device__(IndexT k) {
                         auto begin = p_d_C + k * n;
                         auto end   = begin + n;
                         ByteT one{1};
@@ -124,7 +124,6 @@ struct SCC_Data {
     size_t nrows = nrows_;
     size_t count = 0;

-    ByteT* p_d_C     = d_C.data().get();
     ByteT* p_d_Cprev = get_Cprev().data().get();

     size_t n2 = nrows * nrows;
@@ -136,57 +135,60 @@ struct SCC_Data {
     do {
       flag.set(0);

-      thrust::for_each(thrust::device,
-                       thrust::make_counting_iterator<size_t>(0),
-                       thrust::make_counting_iterator<size_t>(n2),
-                       [nrows, p_d_C, p_d_Cprev, p_d_flag, p_d_ro, p_d_ci] __device__(size_t indx) {
-                         ByteT one{1};
-
-                         auto i = indx / nrows;
-                         auto j = indx % nrows;
-
-                         if ((i == j) || (p_d_Cprev[indx] == one))
-                           p_d_C[indx] = one;
-                         else {
-                           // this is where a hash-map could help:
-                           // only need hashmap[(i,j)]={0,1} (`1` for "hit");
-                           // and only for new entries!
-                           // already existent entries are covered by
-                           // the `if`-branch above!
-                           // Hence, hashmap[] can use limited space:
-                           // M = max_l{number(new `1` entries)}, where
-                           // l = #iterations in the do-loop!
-                           // M ~ new `1` entries between A^k and A^{k+1},
-                           // k=1,2,...
-                           // Might M actually be M ~ nnz(A) = |E| ?!
-                           // Probably, because the primitive hash
-                           //(via find_if) uses a search space of nnz(A)
-                           //
-                           // But, what if more than 1 entry pops-up in a row?
-                           // Not an issue! Because the hash key is (i,j), and no
-                           // more than one entry can exist in position (i,j)!
-                           //
-                           // And remember, we only need to store the new (i,j) keys
-                           // that an iteration produces wrt to the previous iteration!
-                           //
-                           auto begin = p_d_ci + p_d_ro[i];
-                           auto end   = p_d_ci + p_d_ro[i + 1];
-                           auto pos   = thrust::find_if(
-                             thrust::seq, begin, end, [one, j, nrows, p_d_Cprev, p_d_ci](IndexT k) {
-                               return (p_d_Cprev[k * nrows + j] == one);
-                             });
-
-                           if (pos != end) p_d_C[indx] = one;
-                         }
-
-                         if (p_d_C[indx] != p_d_Cprev[indx])
-                           *p_d_flag = 1;  // race-condition: harmless, worst case many threads
-                                           // write the same value
-                       });
+      thrust::for_each(
+        thrust::device,
+        thrust::make_counting_iterator<size_t>(0),
+        thrust::make_counting_iterator<size_t>(n2),
+        [nrows, p_d_C = p_d_C_, p_d_Cprev, p_d_flag, p_d_ro, p_d_ci] __device__(size_t indx) {
+          ByteT one{1};
+
+          auto i = indx / nrows;
+          auto j = indx % nrows;
+
+          if ((i == j) || (p_d_Cprev[indx] == one)) {
+            p_d_C[indx] = one;
+          } else {
+            // this amounts to A (^,v) B
+            // (where A = adjacency matrix defined by (p_ro, p_ci),
+            //        B := p_d_Cprev; (^,v) := (*,+) semiring);
+            // Here's why:
+            // (A (^,v) B)[i][j] := A[i][.] (^,v) B[j][.]
+            // (where X[i][.] := i-th row of X;
+            //        X[.][j] := j-th column of X);
+            // which is:
+            // 1, iff A[i][.] and B[j][.] have a 1 in the same location,
have a 1 in the same location, + // 0, otherwise; + // + // i.e., corresponfing entry in p_d_C is 1 + // if B[k][j] == 1 for any column k in A's i-th row; + // hence, for each column k of row A[i][.], + // which is the set: + // k \in {p_ci + p_ro[i], ..., p_ci + p_ro[i+1] - 1}, + // check if (B[k][j] == 1), + // i.e., p_d_Cprev[k*nrows + j]) == 1: + // + auto begin = p_d_ci + p_d_ro[i]; + auto end = p_d_ci + p_d_ro[i + 1]; + auto pos = thrust::find_if( + thrust::seq, begin, end, [one, j, nrows, p_d_Cprev, p_d_ci](IndexT k) { + return (p_d_Cprev[k * nrows + j] == one); + }); + + if (pos != end) p_d_C[indx] = one; + } + + if (p_d_C[indx] != p_d_Cprev[indx]) + *p_d_flag = 1; // race-condition: harmless, + // worst case many threads + // write the _same_ value + }); ++count; cudaDeviceSynchronize(); - std::swap(p_d_C, p_d_Cprev); + std::swap(p_d_C_, p_d_Cprev); // Note 1: this swap makes `p_d_Cprev` the + // most recently updated matrix pointer + // at the end of this loop + // (see `Note 2` why this matters); } while (flag.is_set()); // C & Ct: @@ -196,11 +198,13 @@ struct SCC_Data { thrust::for_each(thrust::device, thrust::make_counting_iterator(0), thrust::make_counting_iterator(n2), - [nrows, p_d_C, p_d_Cprev] __device__(size_t indx) { + [nrows, p_d_C = p_d_C_, p_d_Cprev] __device__(size_t indx) { auto i = indx / nrows; auto j = indx % nrows; auto tindx = j * nrows + i; + // Note 2: per Note 1, p_d_Cprev is latest: + // p_d_C[indx] = (p_d_Cprev[indx]) & (p_d_Cprev[tindx]); }); @@ -215,6 +219,9 @@ struct SCC_Data { const IndexT* p_d_c_i_; // column indices thrust::device_vector d_C; thrust::device_vector d_Cprev; + ByteT* p_d_C_{nullptr}; // holds the most recent update, + // which can have storage in any of d_C or d_Cprev, + // because the pointers get swapped! thrust::device_vector& get_Cprev(void) { return d_Cprev; } }; diff --git a/cpp/src/components/utils.h b/cpp/src/components/utils.h index c9ebb6ac4d1..4e7dbe075da 100644 --- a/cpp/src/components/utils.h +++ b/cpp/src/components/utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ #include -#include +#include namespace MLCommon { @@ -73,7 +73,9 @@ class Exception : public std::exception { return; } ///@todo: support for demangling of C++ symbol names - for (int i = 0; i < depth; ++i) { oss << "#" << i << " in " << strings.get()[i] << std::endl; } + for (int i = 0; i < depth; ++i) { + oss << "#" << i << " in " << strings.get()[i] << std::endl; + } msg += oss.str(); #endif // __GNUC__ } diff --git a/cpp/src/components/weak_cc.cuh b/cpp/src/components/weak_cc.cuh index d644a988117..e0da23c2ae8 100644 --- a/cpp/src/components/weak_cc.cuh +++ b/cpp/src/components/weak_cc.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
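[Editor's sketch, not part of the patch] The closure loop in scc_matrix.cuh above is easier to follow on the host. Below is a plain C++ analogue of the sweep described in the semiring comment, iterated to a fixed point; the function name and the dense std::vector bitmap are illustrative assumptions, not the device code.

#include <cstddef>
#include <utility>
#include <vector>

// Host analogue of the do-loop above: each sweep computes
//   C[i][j] = (i == j) || Cprev[i][j] || OR_k (A[i][k] && Cprev[k][j])
// over the (AND, OR) semiring, where A is the CSR graph (ro, ci).
std::vector<unsigned char> boolean_closure(std::vector<int> const& ro, std::vector<int> const& ci)
{
  std::size_t n = ro.size() - 1;
  std::vector<unsigned char> C(n * n, 0);
  std::vector<unsigned char> Cprev(n * n, 0);
  bool changed = true;
  while (changed) {
    changed = false;
    for (std::size_t i = 0; i < n; ++i) {
      for (std::size_t j = 0; j < n; ++j) {
        unsigned char v = (i == j || Cprev[i * n + j]) ? 1 : 0;
        for (int e = ro[i]; !v && e < ro[i + 1]; ++e) {
          v = Cprev[static_cast<std::size_t>(ci[e]) * n + j];  // B[k][j] with k = ci[e]
        }
        if (v != Cprev[i * n + j]) { changed = true; }
        C[i * n + j] = v;
      }
    }
    std::swap(C, Cprev);  // like Note 1: Cprev now holds the latest sweep
  }
  return Cprev;  // fixed point of I | A | A^2 | ...
}

run_scc then ANDs this closure with its transpose (C &= Ct), and get_labels assigns each vertex the index of the first set entry in its row, so every vertex of a strongly connected component reports the component's smallest vertex index.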
@@ -47,22 +47,22 @@ namespace Sparse { class WeakCCState { public: - bool *xa; - bool *fa; - bool *m; + bool* xa; + bool* fa; + bool* m; bool owner; - WeakCCState(bool *xa, bool *fa, bool *m) : xa(xa), fa(fa), m(m) {} + WeakCCState(bool* xa, bool* fa, bool* m) : xa(xa), fa(fa), m(m) {} }; template -__global__ void weak_cc_label_device(vertex_t *labels, - edge_t const *offsets, - vertex_t const *indices, +__global__ void weak_cc_label_device(vertex_t* labels, + edge_t const* offsets, + vertex_t const* indices, edge_t nnz, - bool *fa, - bool *xa, - bool *m, + bool* fa, + bool* xa, + bool* m, vertex_t startVertexId, vertex_t batchSize) { @@ -115,7 +115,7 @@ __global__ void weak_cc_label_device(vertex_t *labels, } template -__global__ void weak_cc_init_label_kernel(vertex_t *labels, +__global__ void weak_cc_init_label_kernel(vertex_t* labels, vertex_t startVertexId, vertex_t batchSize, vertex_t MAX_LABEL, @@ -132,7 +132,7 @@ __global__ void weak_cc_init_label_kernel(vertex_t *labels, template __global__ void weak_cc_init_all_kernel( - vertex_t *labels, bool *fa, bool *xa, vertex_t N, vertex_t MAX_LABEL) + vertex_t* labels, bool* fa, bool* xa, vertex_t N, vertex_t MAX_LABEL) { vertex_t tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { @@ -143,12 +143,12 @@ __global__ void weak_cc_init_all_kernel( } template -void weak_cc_label_batched(vertex_t *labels, - edge_t const *offsets, - vertex_t const *indices, +void weak_cc_label_batched(vertex_t* labels, + edge_t const* offsets, + vertex_t const* indices, edge_t nnz, vertex_t N, - WeakCCState &state, + WeakCCState& state, vertex_t startVertexId, vertex_t batchSize, cudaStream_t stream, @@ -217,14 +217,14 @@ template bool> -void weak_cc_batched(vertex_t *labels, - edge_t const *offsets, - vertex_t const *indices, +void weak_cc_batched(vertex_t* labels, + edge_t const* offsets, + vertex_t const* indices, edge_t nnz, vertex_t N, vertex_t startVertexId, vertex_t batchSize, - WeakCCState &state, + WeakCCState& state, cudaStream_t stream, Lambda filter_op) { @@ -273,9 +273,9 @@ template bool> -void weak_cc(vertex_t *labels, - edge_t const *offsets, - vertex_t const *indices, +void weak_cc(vertex_t* labels, + edge_t const* offsets, + vertex_t const* indices, edge_t nnz, vertex_t N, cudaStream_t stream, @@ -315,9 +315,9 @@ void weak_cc(vertex_t *labels, * @param stream Cuda stream to use */ template -void weak_cc_entry(vertex_t *labels, - edge_t const *offsets, - vertex_t const *indices, +void weak_cc_entry(vertex_t* labels, + edge_t const* offsets, + vertex_t const* indices, edge_t nnz, vertex_t N, cudaStream_t stream) diff --git a/cpp/src/components/weakly_connected_components.cu b/cpp/src/components/weakly_connected_components.cu new file mode 100644 index 00000000000..da63161da33 --- /dev/null +++ b/cpp/src/components/weakly_connected_components.cu @@ -0,0 +1,855 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace cugraph { +namespace experimental { + +namespace { + +// FIXME: this function (after modification) may be useful for SSSP with the near-far method to +// determine the near-far threshold. +// adds new roots until the sum of the degrees first becomes no smaller than degree_sum_threshold +// and returns a triplet of (new roots, number of scanned candidates, sum of the degrees of the new +// roots) +template +std::tuple, + typename GraphViewType::vertex_type, + typename GraphViewType::edge_type> +accumulate_new_roots(raft::handle_t const& handle, + vertex_partition_device_view_t vertex_partition, + typename GraphViewType::vertex_type const* components, + typename GraphViewType::edge_type const* degrees, + typename GraphViewType::vertex_type const* candidate_first, + typename GraphViewType::vertex_type const* candidate_last, + typename GraphViewType::vertex_type max_new_roots, + typename GraphViewType::edge_type degree_sum_threshold) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + + // tuning parameter (time to scan max_scan_size elements should not take significantly longer than + // scanning a single element) + vertex_t max_scan_size = + static_cast(handle.get_device_properties().multiProcessorCount) * vertex_t{16384}; + + rmm::device_uvector new_roots(max_new_roots, handle.get_stream_view()); + vertex_t num_new_roots{0}; + vertex_t num_scanned{0}; + edge_t degree_sum{0}; + while ((candidate_first + num_scanned < candidate_last) && (degree_sum < degree_sum_threshold) && + (num_new_roots < max_new_roots)) { + auto scan_size = std::min( + max_scan_size, + static_cast(thrust::distance(candidate_first + num_scanned, candidate_last))); + + rmm::device_uvector tmp_new_roots(scan_size, handle.get_stream_view()); + rmm::device_uvector tmp_indices(tmp_new_roots.size(), handle.get_stream_view()); + auto input_pair_first = thrust::make_zip_iterator(thrust::make_tuple( + candidate_first + num_scanned, thrust::make_counting_iterator(vertex_t{0}))); + auto output_pair_first = + thrust::make_zip_iterator(thrust::make_tuple(tmp_new_roots.begin(), tmp_indices.begin())); + tmp_new_roots.resize( + static_cast(thrust::distance( + output_pair_first, + thrust::copy_if( + rmm::exec_policy(handle.get_stream_view()), + input_pair_first, + input_pair_first + scan_size, + output_pair_first, + [vertex_partition, components] __device__(auto pair) { + auto v = thrust::get<0>(pair); + return (components[vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v)] == + invalid_component_id::value); + }))), + handle.get_stream_view()); + tmp_indices.resize(tmp_new_roots.size(), handle.get_stream_view()); + + if (tmp_new_roots.size() > 0) { + rmm::device_uvector tmp_cumulative_degrees(tmp_new_roots.size(), + handle.get_stream_view()); + thrust::transform( + rmm::exec_policy(handle.get_stream_view()), + tmp_new_roots.begin(), + tmp_new_roots.end(), + tmp_cumulative_degrees.begin(), + [vertex_partition, degrees] __device__(auto v) { + return degrees[vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v)]; + }); + thrust::inclusive_scan(rmm::exec_policy(handle.get_stream_view()), + tmp_cumulative_degrees.begin(), + tmp_cumulative_degrees.end(), +
tmp_cumulative_degrees.begin()); + auto last = thrust::lower_bound(rmm::exec_policy(handle.get_stream_view()), + tmp_cumulative_degrees.begin(), + tmp_cumulative_degrees.end(), + degree_sum_threshold - degree_sum); + if (last != tmp_cumulative_degrees.end()) { ++last; } + auto tmp_num_new_roots = + std::min(static_cast(thrust::distance(tmp_cumulative_degrees.begin(), last)), + max_new_roots - num_new_roots); + + thrust::copy(rmm::exec_policy(handle.get_stream_view()), + tmp_new_roots.begin(), + tmp_new_roots.begin() + tmp_num_new_roots, + new_roots.begin() + num_new_roots); + num_new_roots += tmp_num_new_roots; + vertex_t tmp_num_scanned{0}; + edge_t tmp_degree_sum{0}; + if (tmp_num_new_roots == static_cast(tmp_new_roots.size())) { + tmp_num_scanned = scan_size; + } else { + raft::update_host( + &tmp_num_scanned, tmp_indices.data() + tmp_num_new_roots, size_t{1}, handle.get_stream()); + } + raft::update_host(&tmp_degree_sum, + tmp_cumulative_degrees.data() + (tmp_num_new_roots - 1), + size_t{1}, + handle.get_stream()); + handle.get_stream_view().synchronize(); + num_scanned += tmp_num_scanned; + degree_sum += tmp_degree_sum; + } else { + num_scanned += scan_size; + } + } + + new_roots.resize(num_new_roots, handle.get_stream_view()); + new_roots.shrink_to_fit(handle.get_stream_view()); + + return std::make_tuple(std::move(new_roots), num_scanned, degree_sum); +} + +// FIXME: to silence the spurious warning (missing return statement ...) due to the nvcc bug +// (https://stackoverflow.com/questions/64523302/cuda-missing-return-statement-at-end-of-non-void- +// function-in-constexpr-if-fun) +template +struct v_op_t { + using vertex_type = typename GraphViewType::vertex_type; + + vertex_partition_device_view_t + vertex_partition{}; + vertex_type* level_components{}; + decltype(thrust::make_zip_iterator(thrust::make_tuple( + static_cast(nullptr), static_cast(nullptr)))) edge_buffer_first{}; + // FIXME: we can use cuda::atomic instead but currently on a system with x86 + GPU, this requires + // placing the atomic barrier on managed memory and this adds additional complication. + size_t* num_edge_inserts{}; + size_t next_bucket_idx{}; + size_t conflict_bucket_idx{}; // relevant only if GraphViewType::is_multi_gpu is true + + template + __device__ std::enable_if_t>> + operator()(thrust::tuple tagged_v, int v_val /* dummy */) const + { + auto tag = thrust::get<1>(tagged_v); + auto v_offset = + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(thrust::get<0>(tagged_v)); + // FIXME: better switch to atomic_ref after + // https://github.com/nvidia/libcudacxx/milestone/2 + auto old = + atomicCAS(level_components + v_offset, invalid_component_id::value, tag); + if (old != invalid_component_id::value && old != tag) { // conflict + return thrust::optional>{ + thrust::make_tuple(conflict_bucket_idx, std::byte{0} /* dummy */)}; + } else { + return (old == invalid_component_id::value) + ? 
thrust::optional>{thrust::make_tuple( + next_bucket_idx, std::byte{0} /* dummy */)} + : thrust::nullopt; + } + } + + template + __device__ std::enable_if_t>> + operator()(thrust::tuple tagged_v, int v_val /* dummy */) const + { + return thrust::optional>{ + thrust::make_tuple(next_bucket_idx, std::byte{0} /* dummy */)}; + } +}; + +template +void weakly_connected_components_impl(raft::handle_t const& handle, + GraphViewType const& push_graph_view, + typename GraphViewType::vertex_type* components, + bool do_expensive_check) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + + static_assert(std::is_integral::value, + "GraphViewType::vertex_type should be integral."); + static_assert(!GraphViewType::is_adj_matrix_transposed, + "GraphViewType should support the push model."); + + auto const num_vertices = push_graph_view.get_number_of_vertices(); + if (num_vertices == 0) { return; } + + // 1. check input arguments + + CUGRAPH_EXPECTS( + push_graph_view.is_symmetric(), + "Invalid input argument: input graph should be symmetric for weakly connected components."); + + if (do_expensive_check) { + // nothing to do + } + + // 2. recursively run multi-root frontier expansion + + enum class Bucket { + cur, + next, + conflict /* relevant only if GraphViewType::is_multi_gpu is true */, + num_buckets + }; + // tuning parameter to balance work per iteration (should be large enough to be throughput + // bounded) vs # conflicts between frontiers with different roots (# conflicts == # edges for the + // next level) + auto degree_sum_threshold = + static_cast(handle.get_device_properties().multiProcessorCount) * edge_t{1024}; + + size_t num_levels{0}; + graph_t + level_graph(handle); + rmm::device_uvector level_renumber_map(0, handle.get_stream_view()); + std::vector> level_component_vectors{}; + // vertex ID in this level to the component ID in the previous level + std::vector> level_renumber_map_vectors{}; + std::vector level_local_vertex_first_vectors{}; + while (true) { + auto level_graph_view = num_levels == 0 ? push_graph_view : level_graph.view(); + auto vertex_partition = vertex_partition_device_view_t( + level_graph_view.get_vertex_partition_view()); + level_component_vectors.push_back(rmm::device_uvector( + num_levels == 0 ? vertex_t{0} : level_graph_view.get_number_of_local_vertices(), + handle.get_stream_view())); + level_renumber_map_vectors.push_back(std::move(level_renumber_map)); + level_local_vertex_first_vectors.push_back(level_graph_view.get_local_vertex_first()); + auto level_components = + num_levels == 0 ? components : level_component_vectors[num_levels].data(); + ++num_levels; + auto degrees = level_graph_view.compute_out_degrees(handle); + + // 2-1. filter out isolated vertices + + auto pair_first = thrust::make_zip_iterator(thrust::make_tuple( + thrust::make_counting_iterator(level_graph_view.get_local_vertex_first()), degrees.begin())); + thrust::transform(rmm::exec_policy(handle.get_stream_view()), + pair_first, + pair_first + level_graph_view.get_number_of_local_vertices(), + level_components, + [] __device__(auto pair) { + auto v = thrust::get<0>(pair); + auto degree = thrust::get<1>(pair); + return degree > 0 ? invalid_component_id::value : v; + }); + + // 2-2. 
initialize new root candidates + + // Vertices are first partitioned into high-degree and low-degree vertices; we can reach + // degree_sum_threshold with fewer high-degree vertices leading to a higher compression ratio. + // The degree threshold is set to ceil(sqrt(degree_sum_threshold * 2)); this guarantees the + // compression ratio of at least 50% (ignoring rounding errors) even if all the selected roots + // fall into a single connected component as there will be at least as many non-root vertices in + // the connected component (assuming there are no multi-edges; if there are multi-edges, we may + // not get 50% compression in # vertices but still get compression in # edges). The remaining + // low-degree vertices will be randomly shuffled so comparable ratios of vertices will be + // selected as roots in the remaining connected components. + + rmm::device_uvector new_root_candidates( + level_graph_view.get_number_of_local_vertices(), handle.get_stream_view()); + new_root_candidates.resize( + thrust::distance( + new_root_candidates.begin(), + thrust::copy_if( + rmm::exec_policy(handle.get_stream_view()), + thrust::make_counting_iterator(level_graph_view.get_local_vertex_first()), + thrust::make_counting_iterator(level_graph_view.get_local_vertex_last()), + new_root_candidates.begin(), + [vertex_partition, level_components] __device__(auto v) { + return level_components[vertex_partition.get_local_vertex_offset_from_vertex_nocheck( + v)] == invalid_component_id::value; + })), + handle.get_stream_view()); + auto high_degree_partition_last = thrust::stable_partition( + rmm::exec_policy(handle.get_stream_view()), + new_root_candidates.begin(), + new_root_candidates.end(), + [vertex_partition, + degrees = degrees.data(), + threshold = static_cast( + ceil(sqrt(static_cast(degree_sum_threshold) * 2.0)))] __device__(auto v) { + return degrees[vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v)] >= + threshold; + }); + thrust::shuffle(rmm::exec_policy(handle.get_stream_view()), + high_degree_partition_last, + new_root_candidates.end(), + thrust::default_random_engine()); + + double constexpr max_new_roots_ratio = + 0.05; // to avoid selecting all the vertices as roots leading to zero compression + static_assert(max_new_roots_ratio > 0.0); + auto max_new_roots = std::max( + static_cast(new_root_candidates.size() * max_new_roots_ratio), vertex_t{1}); + + auto init_max_new_roots = max_new_roots; + if (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + + auto first_candidate_degree = thrust::transform_reduce( + rmm::exec_policy(handle.get_stream_view()), + new_root_candidates.begin(), + new_root_candidates.begin() + (new_root_candidates.size() > 0 ? 1 : 0), + [vertex_partition, degrees = degrees.data()] __device__(auto v) { + return degrees[vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v)]; + }, + edge_t{0}, + thrust::plus{}); + + auto first_candidate_degrees = + host_scalar_gather(comm, first_candidate_degree, int{0}, handle.get_stream()); + auto new_root_candidate_counts = + host_scalar_gather(comm, new_root_candidates.size(), int{0}, handle.get_stream()); + + if (comm_rank == 0) { + std::vector init_max_new_root_counts(comm_size, vertex_t{0}); + + // if there exist very high-degree vertices, we can exceed degree_sum_threshold * comm_size + // with fewer than one root per GPU + if (std::reduce(first_candidate_degrees.begin(), first_candidate_degrees.end()) > + degree_sum_threshold * comm_size) { + std::vector> degree_gpuid_pairs(comm_size); + for (int i = 0; i < comm_size; ++i) { + degree_gpuid_pairs[i] = std::make_tuple(first_candidate_degrees[i], i); + } + std::sort(degree_gpuid_pairs.begin(), degree_gpuid_pairs.end(), [](auto lhs, auto rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + }); + edge_t sum{0}; + for (size_t i = 0; i < degree_gpuid_pairs.size(); ++i) { + sum += std::get<0>(degree_gpuid_pairs[i]); + init_max_new_root_counts[std::get<1>(degree_gpuid_pairs[i])] = 1; + if (sum > degree_sum_threshold * comm_size) { break; } + } + } + // to avoid selecting too many (possibly all) vertices as initial roots leading to no + // compression in the worst case. + else if (level_graph_view.get_number_of_vertices() <= + static_cast(handle.get_comms().get_size() * + ceil(1.0 / max_new_roots_ratio))) { + std::vector gpuids{}; + gpuids.reserve( + std::reduce(new_root_candidate_counts.begin(), new_root_candidate_counts.end())); + for (size_t i = 0; i < new_root_candidate_counts.size(); ++i) { + gpuids.insert(gpuids.end(), new_root_candidate_counts[i], static_cast(i)); + } + std::random_device rd{}; + std::shuffle(gpuids.begin(), gpuids.end(), std::mt19937(rd())); + gpuids.resize( + std::max(static_cast(gpuids.size() * max_new_roots_ratio), vertex_t{1})); + for (size_t i = 0; i < gpuids.size(); ++i) { + ++init_max_new_root_counts[gpuids[i]]; + } + } else { + std::fill(init_max_new_root_counts.begin(), + init_max_new_root_counts.end(), + std::numeric_limits::max()); + } + + // FIXME: we need to add host_scalar_scatter +#if 1 + rmm::device_uvector d_counts(comm_size, handle.get_stream_view()); + raft::update_device(d_counts.data(), + init_max_new_root_counts.data(), + init_max_new_root_counts.size(), + handle.get_stream()); + device_bcast( + comm, d_counts.data(), d_counts.data(), d_counts.size(), int{0}, handle.get_stream()); + raft::update_host( + &init_max_new_roots, d_counts.data() + comm_rank, size_t{1}, handle.get_stream()); +#else + init_max_new_roots = + host_scalar_scatter(comm, init_max_new_root_counts.data(), int{0}, handle.get_stream()); +#endif + } else { + // FIXME: we need to add host_scalar_scatter +#if 1 + rmm::device_uvector d_counts(comm_size, handle.get_stream_view()); + device_bcast( + comm, d_counts.data(), d_counts.data(), d_counts.size(), int{0}, handle.get_stream()); + raft::update_host( + &init_max_new_roots, d_counts.data() + comm_rank, size_t{1}, handle.get_stream()); +#else + init_max_new_roots = + host_scalar_scatter(comm, init_max_new_root_counts.data(), int{0}, handle.get_stream()); +#endif + } + + handle.get_stream_view().synchronize(); + init_max_new_roots = std::min(init_max_new_roots, max_new_roots); + } + + // 2-3. initialize vertex frontier, edge_buffer, and col_components (if multi-gpu) + + VertexFrontier(Bucket::num_buckets)> + vertex_frontier(handle); + vertex_t next_candidate_offset{0}; + edge_t edge_count{0}; + + auto edge_buffer = + allocate_dataframe_buffer>(0, handle.get_stream()); + // FIXME: we can use cuda::atomic instead but currently on a system with x86 + GPU, this + // requires placing the atomic variable on managed memory and this makes it less attractive. + rmm::device_scalar num_edge_inserts(size_t{0}, handle.get_stream_view()); + + rmm::device_uvector col_components( + GraphViewType::is_multi_gpu ? level_graph_view.get_number_of_local_adj_matrix_partition_cols() + : vertex_t{0}, + handle.get_stream_view()); + if (GraphViewType::is_multi_gpu) { + thrust::fill(rmm::exec_policy(handle.get_stream_view()), + col_components.begin(), + col_components.end(), + invalid_component_id::value); + } + + // 2-4. iterate until every vertex gets visited + + size_t iter{0}; + while (true) { + if ((edge_count < degree_sum_threshold) && + (next_candidate_offset < static_cast(new_root_candidates.size()))) { + auto [new_roots, num_scanned, degree_sum] = accumulate_new_roots( + handle, + vertex_partition, + level_components, + degrees.data(), + new_root_candidates.data() + next_candidate_offset, + new_root_candidates.data() + new_root_candidates.size(), + iter == 0 ? init_max_new_roots : max_new_roots, + degree_sum_threshold - edge_count); + next_candidate_offset += num_scanned; + edge_count += degree_sum; + + thrust::sort( + rmm::exec_policy(handle.get_stream_view()), new_roots.begin(), new_roots.end()); + + thrust::for_each( + rmm::exec_policy(handle.get_stream_view()), + new_roots.begin(), + new_roots.end(), + [vertex_partition, components = level_components] __device__(auto c) { + components[vertex_partition.get_local_vertex_offset_from_vertex_nocheck(c)] = c; + }); + + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(new_roots.begin(), new_roots.begin())); + vertex_frontier.get_bucket(static_cast(Bucket::cur)) + .insert(pair_first, pair_first + new_roots.size()); + } + + if (vertex_frontier.get_bucket(static_cast(Bucket::cur)).aggregate_size() == 0) { + break; + } + + if (GraphViewType::is_multi_gpu) { + copy_to_adj_matrix_col( + handle, + level_graph_view, + thrust::get<0>(vertex_frontier.get_bucket(static_cast(Bucket::cur)) + .begin() + .get_iterator_tuple()), + thrust::get<0>(vertex_frontier.get_bucket(static_cast(Bucket::cur)) + .end() + .get_iterator_tuple()), + level_components, + col_components.begin()); + } + + auto max_pushes = + GraphViewType::is_multi_gpu + ? compute_num_out_nbrs_from_frontier( + handle, level_graph_view, vertex_frontier, static_cast(Bucket::cur)) + : edge_count; + + // FIXME: if we use cuco::static_map (no duplicates, ideally we need static_set), edge_buffer + // size cannot exceed (# roots)^2 and we can avoid additional sort & unique (but resizing the + // buffer may be more expensive). + auto old_num_edge_inserts = num_edge_inserts.value(handle.get_stream_view()); + resize_dataframe_buffer>( + edge_buffer, old_num_edge_inserts + max_pushes, handle.get_stream()); + + update_frontier_v_push_if_out_nbr( + handle, + level_graph_view, + vertex_frontier, + static_cast(Bucket::cur), + GraphViewType::is_multi_gpu ?
std::vector{static_cast(Bucket::next), + static_cast(Bucket::conflict)} + : std::vector{static_cast(Bucket::next)}, + thrust::make_counting_iterator(0) /* dummy */, + thrust::make_counting_iterator(0) /* dummy */, + [col_components = GraphViewType::is_multi_gpu ? col_components.data() : level_components, + col_first = level_graph_view.get_local_adj_matrix_partition_col_first(), + edge_buffer_first = + get_dataframe_buffer_begin>(edge_buffer), + num_edge_inserts = num_edge_inserts.data()] __device__(auto tagged_src, + vertex_t dst, + auto src_val, + auto dst_val) { + auto tag = thrust::get<1>(tagged_src); + auto col_offset = dst - col_first; + // FIXME: better switch to atomic_ref after + // https://github.com/nvidia/libcudacxx/milestone/2 + auto old = + atomicCAS(col_components + col_offset, invalid_component_id::value, tag); + if (old != invalid_component_id::value && old != tag) { // conflict + static_assert(sizeof(unsigned long long int) == sizeof(size_t)); + auto edge_idx = atomicAdd(reinterpret_cast(num_edge_inserts), + static_cast(1)); + // keep only the edges in the lower triangular part + *(edge_buffer_first + edge_idx) = + tag >= old ? thrust::make_tuple(tag, old) : thrust::make_tuple(old, tag); + } + return (old == invalid_component_id::value) ? thrust::optional{tag} + : thrust::nullopt; + }, + reduce_op::null(), + thrust::make_constant_iterator(0) /* dummy */, + thrust::make_discard_iterator() /* dummy */, + v_op_t{ + vertex_partition, + level_components, + get_dataframe_buffer_begin>(edge_buffer), + num_edge_inserts.data(), + static_cast(Bucket::next), + static_cast(Bucket::conflict)}); + + if (GraphViewType::is_multi_gpu) { + auto cur_num_edge_inserts = num_edge_inserts.value(handle.get_stream_view()); + auto& conflict_bucket = vertex_frontier.get_bucket(static_cast(Bucket::conflict)); + resize_dataframe_buffer>( + edge_buffer, cur_num_edge_inserts + conflict_bucket.size(), handle.get_stream()); + thrust::for_each( + rmm::exec_policy(handle.get_stream_view()), + conflict_bucket.begin(), + conflict_bucket.end(), + [vertex_partition, + level_components, + edge_buffer_first = + get_dataframe_buffer_begin>(edge_buffer), + num_edge_inserts = num_edge_inserts.data()] __device__(auto tagged_v) { + auto v_offset = vertex_partition.get_local_vertex_offset_from_vertex_nocheck( + thrust::get<0>(tagged_v)); + auto old = *(level_components + v_offset); + auto tag = thrust::get<1>(tagged_v); + static_assert(sizeof(unsigned long long int) == sizeof(size_t)); + auto edge_idx = atomicAdd(reinterpret_cast(num_edge_inserts), + static_cast(1)); + // keep only the edges in the lower triangular part + *(edge_buffer_first + edge_idx) = + tag >= old ? thrust::make_tuple(tag, old) : thrust::make_tuple(old, tag); + }); + conflict_bucket.clear(); + } + + // maintain the list of sorted unique edges (we can avoid this if we use cuco::static_map(no + // duplicates, ideally we need static_set)). 
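[Editor's sketch, not part of the patch] The block that follows implements the comment above: the freshly appended range is sorted, merged with the already sorted-and-unique prefix, and deduplicated. A host-side analogue with the standard library (a sketch; std::pair edges are an assumption, and the device code above merges through a temporary buffer instead since thrust provides no device inplace_merge):

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// edges[0, old_n) is sorted and unique; edges[old_n, edges.size()) holds the
// newly inserted pairs. Restore the sorted-unique invariant over the whole vector.
void restore_sorted_unique(std::vector<std::pair<int, int>>& edges, std::size_t old_n)
{
  std::sort(edges.begin() + old_n, edges.end());                      // sort the new tail
  std::inplace_merge(edges.begin(), edges.begin() + old_n, edges.end());  // merge with prefix
  edges.erase(std::unique(edges.begin(), edges.end()), edges.end());  // drop duplicates
}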
+ auto new_num_edge_inserts = num_edge_inserts.value(handle.get_stream_view()); + if (new_num_edge_inserts > old_num_edge_inserts) { + auto edge_first = + get_dataframe_buffer_begin>(edge_buffer); + thrust::sort(rmm::exec_policy(handle.get_stream_view()), + edge_first + old_num_edge_inserts, + edge_first + new_num_edge_inserts); + if (old_num_edge_inserts > 0) { + auto tmp_edge_buffer = allocate_dataframe_buffer>( + new_num_edge_inserts, handle.get_stream()); + auto tmp_edge_first = + get_dataframe_buffer_begin>(tmp_edge_buffer); + thrust::merge(rmm::exec_policy(handle.get_stream_view()), + edge_first, + edge_first + old_num_edge_inserts, + edge_first + old_num_edge_inserts, + edge_first + new_num_edge_inserts, + tmp_edge_first); + edge_buffer = std::move(tmp_edge_buffer); + } + edge_first = get_dataframe_buffer_begin>(edge_buffer); + auto unique_edge_last = thrust::unique(rmm::exec_policy(handle.get_stream_view()), + edge_first, + edge_first + new_num_edge_inserts); + auto num_unique_edges = static_cast(thrust::distance(edge_first, unique_edge_last)); + num_edge_inserts.set_value_async(num_unique_edges, handle.get_stream_view()); + } + + vertex_frontier.get_bucket(static_cast(Bucket::cur)).clear(); + vertex_frontier.get_bucket(static_cast(Bucket::cur)).shrink_to_fit(); + vertex_frontier.swap_buckets(static_cast(Bucket::cur), + static_cast(Bucket::next)); + edge_count = thrust::transform_reduce( + rmm::exec_policy(handle.get_stream_view()), + thrust::get<0>(vertex_frontier.get_bucket(static_cast(Bucket::cur)) + .begin() + .get_iterator_tuple()), + thrust::get<0>( + vertex_frontier.get_bucket(static_cast(Bucket::cur)).end().get_iterator_tuple()), + [vertex_partition, degrees = degrees.data()] __device__(auto v) { + return degrees[vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v)]; + }, + edge_t{0}, + thrust::plus()); + + ++iter; + } + + // 2-5. 
construct the next level graph from the edges emitted on conflicts + + auto num_inserts = num_edge_inserts.value(handle.get_stream_view()); + auto aggregate_num_inserts = num_inserts; + if (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + aggregate_num_inserts = host_scalar_allreduce(comm, num_inserts, handle.get_stream()); + } + + if (aggregate_num_inserts > 0) { + resize_dataframe_buffer>( + edge_buffer, static_cast(num_inserts * 2), handle.get_stream()); + auto input_first = get_dataframe_buffer_begin>(edge_buffer); + auto output_first = thrust::make_zip_iterator( + thrust::make_tuple(thrust::get<1>(input_first.get_iterator_tuple()), + thrust::get<0>(input_first.get_iterator_tuple()))) + + num_inserts; + thrust::copy(rmm::exec_policy(handle.get_stream_view()), + input_first, + input_first + num_inserts, + output_first); + + if (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + std::tie(edge_buffer, std::ignore) = + cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, + get_dataframe_buffer_begin>(edge_buffer), + get_dataframe_buffer_end>(edge_buffer), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + auto edge_first = + get_dataframe_buffer_begin>(edge_buffer); + auto edge_last = get_dataframe_buffer_end>(edge_buffer); + thrust::sort(rmm::exec_policy(handle.get_stream_view()), edge_first, edge_last); + auto unique_edge_last = + thrust::unique(rmm::exec_policy(handle.get_stream_view()), edge_first, edge_last); + resize_dataframe_buffer>( + edge_buffer, + static_cast(thrust::distance(edge_first, unique_edge_last)), + handle.get_stream()); + shrink_to_fit_dataframe_buffer>(edge_buffer, + handle.get_stream()); + } + + std::optional> tmp_renumber_map{std::nullopt}; + std::tie(level_graph, tmp_renumber_map) = + create_graph_from_edgelist(handle, + std::nullopt, + std::move(std::get<0>(edge_buffer)), + std::move(std::get<1>(edge_buffer)), + std::nullopt, + graph_properties_t{true, false}, + true); + level_renumber_map = std::move(*tmp_renumber_map); + } else { + break; + } + } + + // 3. recursively update the current level component IDs from the next level component IDs + + for (size_t i = 0; i < num_levels - 1; ++i) { + size_t next_level = num_levels - 1 - i; + size_t current_level = next_level - 1; + + rmm::device_uvector next_local_vertices(level_renumber_map_vectors[next_level].size(), + handle.get_stream_view()); + thrust::sequence(rmm::exec_policy(handle.get_stream_view()), + next_local_vertices.begin(), + next_local_vertices.end(), + level_local_vertex_first_vectors[next_level]); + relabel( + handle, + std::make_tuple(next_local_vertices.data(), level_renumber_map_vectors[next_level].data()), + next_local_vertices.size(), + level_component_vectors[next_level].data(), + level_component_vectors[next_level].size(), + false); + relabel( + handle, + std::make_tuple(level_renumber_map_vectors[next_level].data(), + level_component_vectors[next_level].data()), + level_renumber_map_vectors[next_level].size(), + current_level == 0 ?
components : level_component_vectors[current_level].data(), + current_level == 0 ? push_graph_view.get_number_of_local_vertices() + : level_component_vectors[current_level].size(), + true); + } +} + +} // namespace + +template +void weakly_connected_components( + raft::handle_t const& handle, + graph_view_t const& graph_view, + vertex_t* components, + bool do_expensive_check) +{ + weakly_connected_components_impl(handle, graph_view, components, do_expensive_check); +} + +// explicit instantiation + +template void weakly_connected_components( + raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t* components, + bool do_expensive_check); + +template void weakly_connected_components( + raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t* components, + bool do_expensive_check); + +template void weakly_connected_components( + raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t* components, + bool do_expensive_check); + +template void weakly_connected_components( + raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t* components, + bool do_expensive_check); + +template void weakly_connected_components( + raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t* components, + bool do_expensive_check); + +template void weakly_connected_components( + raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t* components, + bool do_expensive_check); + +template void weakly_connected_components( + raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t* components, + bool do_expensive_check); + +template void weakly_connected_components( + raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t* components, + bool do_expensive_check); + +template void weakly_connected_components( + raft::handle_t const& handle, + graph_view_t const& graph_view, + int64_t* components, + bool do_expensive_check); + +template void weakly_connected_components( + raft::handle_t const& handle, + graph_view_t const& graph_view, + int64_t* components, + bool do_expensive_check); + +template void weakly_connected_components( + raft::handle_t const& handle, + graph_view_t const& graph_view, + int64_t* components, + bool do_expensive_check); + +template void weakly_connected_components( + raft::handle_t const& handle, + graph_view_t const& graph_view, + int64_t* components, + bool do_expensive_check); + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/converters/COOtoCSR.cu b/cpp/src/converters/COOtoCSR.cu index 787872742e9..49986810539 100644 --- a/cpp/src/converters/COOtoCSR.cu +++ b/cpp/src/converters/COOtoCSR.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,61 +14,71 @@ * limitations under the License. 
*/ -#include +#include #include "COOtoCSR.cuh" namespace cugraph { // Explicit instantiation for uint32_t + float -template std::unique_ptr> coo_to_csr( - GraphCOOView const &graph, rmm::mr::device_memory_resource *); +template std::unique_ptr> +coo_to_csr(legacy::GraphCOOView const& graph, + rmm::mr::device_memory_resource*); // Explicit instantiation for uint32_t + double -template std::unique_ptr> -coo_to_csr(GraphCOOView const &graph, - rmm::mr::device_memory_resource *); +template std::unique_ptr> +coo_to_csr( + legacy::GraphCOOView const& graph, rmm::mr::device_memory_resource*); // Explicit instantiation for int + float -template std::unique_ptr> coo_to_csr( - GraphCOOView const &graph, rmm::mr::device_memory_resource *); +template std::unique_ptr> +coo_to_csr(legacy::GraphCOOView const& graph, + rmm::mr::device_memory_resource*); // Explicit instantiation for int + double -template std::unique_ptr> coo_to_csr( - GraphCOOView const &graph, rmm::mr::device_memory_resource *); +template std::unique_ptr> +coo_to_csr(legacy::GraphCOOView const& graph, + rmm::mr::device_memory_resource*); // Explicit instantiation for int64_t + float -template std::unique_ptr> coo_to_csr( - GraphCOOView const &graph, rmm::mr::device_memory_resource *); +template std::unique_ptr> +coo_to_csr(legacy::GraphCOOView const& graph, + rmm::mr::device_memory_resource*); // Explicit instantiation for int64_t + double -template std::unique_ptr> coo_to_csr( - GraphCOOView const &graph, rmm::mr::device_memory_resource *); +template std::unique_ptr> +coo_to_csr(legacy::GraphCOOView const& graph, + rmm::mr::device_memory_resource*); // in-place versions: // // Explicit instantiation for uint32_t + float template void coo_to_csr_inplace( - GraphCOOView &graph, GraphCSRView &result); + legacy::GraphCOOView& graph, + legacy::GraphCSRView& result); // Explicit instantiation for uint32_t + double template void coo_to_csr_inplace( - GraphCOOView &graph, - GraphCSRView &result); + legacy::GraphCOOView& graph, + legacy::GraphCSRView& result); // Explicit instantiation for int + float template void coo_to_csr_inplace( - GraphCOOView &graph, GraphCSRView &result); + legacy::GraphCOOView& graph, + legacy::GraphCSRView& result); // Explicit instantiation for int + double template void coo_to_csr_inplace( - GraphCOOView &graph, GraphCSRView &result); + legacy::GraphCOOView& graph, + legacy::GraphCSRView& result); // Explicit instantiation for int64_t + float template void coo_to_csr_inplace( - GraphCOOView &graph, GraphCSRView &result); + legacy::GraphCOOView& graph, + legacy::GraphCSRView& result); // Explicit instantiation for int64_t + double template void coo_to_csr_inplace( - GraphCOOView &graph, GraphCSRView &result); + legacy::GraphCOOView& graph, + legacy::GraphCSRView& result); } // namespace cugraph diff --git a/cpp/src/converters/COOtoCSR.cuh b/cpp/src/converters/COOtoCSR.cuh index b110e02a513..641b037efdd 100644 --- a/cpp/src/converters/COOtoCSR.cuh +++ b/cpp/src/converters/COOtoCSR.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,15 +30,15 @@ #include #include -#include -#include +#include +#include #include #include -#include +#include -#include +#include namespace cugraph { namespace detail { @@ -55,38 +55,38 @@ namespace detail { * @tparam WT Type of edge weights. Supported value : float or double. 
* * @param[in] graph The input graph object - * @param[in] stream The cuda stream for kernel calls + * @param[in] stream_view The cuda stream for kernel calls * * @param[out] result Total number of vertices */ template -VT sort(GraphCOOView &graph, cudaStream_t stream) +VT sort(legacy::GraphCOOView& graph, rmm::cuda_stream_view stream_view) { VT max_src_id; VT max_dst_id; if (graph.has_data()) { thrust::stable_sort_by_key( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream_view), graph.dst_indices, graph.dst_indices + graph.number_of_edges, thrust::make_zip_iterator(thrust::make_tuple(graph.src_indices, graph.edge_data))); CUDA_TRY(cudaMemcpy( &max_dst_id, &(graph.dst_indices[graph.number_of_edges - 1]), sizeof(VT), cudaMemcpyDefault)); thrust::stable_sort_by_key( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream_view), graph.src_indices, graph.src_indices + graph.number_of_edges, thrust::make_zip_iterator(thrust::make_tuple(graph.dst_indices, graph.edge_data))); CUDA_TRY(cudaMemcpy( &max_src_id, &(graph.src_indices[graph.number_of_edges - 1]), sizeof(VT), cudaMemcpyDefault)); } else { - thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), + thrust::stable_sort_by_key(rmm::exec_policy(stream_view), graph.dst_indices, graph.dst_indices + graph.number_of_edges, graph.src_indices); CUDA_TRY(cudaMemcpy( &max_dst_id, &(graph.dst_indices[graph.number_of_edges - 1]), sizeof(VT), cudaMemcpyDefault)); - thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), + thrust::stable_sort_by_key(rmm::exec_policy(stream_view), graph.src_indices, graph.src_indices + graph.number_of_edges, graph.dst_indices); @@ -97,14 +97,15 @@ VT sort(GraphCOOView &graph, cudaStream_t stream) } template -void fill_offset( - VT *source, ET *offsets, VT number_of_vertices, ET number_of_edges, cudaStream_t stream) +void fill_offset(VT* source, + ET* offsets, + VT number_of_vertices, + ET number_of_edges, + rmm::cuda_stream_view stream_view) { - thrust::fill(rmm::exec_policy(stream)->on(stream), - offsets, - offsets + number_of_vertices + 1, - number_of_edges); - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::fill( + rmm::exec_policy(stream_view), offsets, offsets + number_of_vertices + 1, number_of_edges); + thrust::for_each(rmm::exec_policy(stream_view), thrust::make_counting_iterator(1), thrust::make_counting_iterator(number_of_edges), [source, offsets] __device__(ET index) { @@ -116,7 +117,7 @@ void fill_offset( off[src[0]] = ET{0}; auto iter = thrust::make_reverse_iterator(offsets + number_of_vertices + 1); - thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::inclusive_scan(rmm::exec_policy(stream_view), iter, iter + number_of_vertices + 1, iter, @@ -124,18 +125,18 @@ void fill_offset( } template -rmm::device_buffer create_offset(VT *source, +rmm::device_buffer create_offset(VT* source, VT number_of_vertices, ET number_of_edges, - cudaStream_t stream, - rmm::mr::device_memory_resource *mr) + rmm::cuda_stream_view stream_view, + rmm::mr::device_memory_resource* mr) { // Offset array needs an extra element at the end to contain the ending offsets // of the last vertex - rmm::device_buffer offsets_buffer(sizeof(ET) * (number_of_vertices + 1), stream, mr); - ET *offsets = static_cast(offsets_buffer.data()); + rmm::device_buffer offsets_buffer(sizeof(ET) * (number_of_vertices + 1), stream_view, mr); + ET* offsets = static_cast(offsets_buffer.data()); - fill_offset(source, offsets, number_of_vertices, number_of_edges, stream); + 
fill_offset(source, offsets, number_of_vertices, number_of_edges, stream_view); return offsets_buffer; } @@ -143,35 +144,39 @@ rmm::device_buffer create_offset(VT *source, } // namespace detail template -std::unique_ptr> coo_to_csr(GraphCOOView const &graph, - rmm::mr::device_memory_resource *mr) +std::unique_ptr> coo_to_csr( + legacy::GraphCOOView const& graph, rmm::mr::device_memory_resource* mr) { - cudaStream_t stream{nullptr}; + rmm::cuda_stream_view stream_view; - GraphCOO temp_graph(graph, stream, mr); - GraphCOOView temp_graph_view = temp_graph.view(); - VT total_vertex_count = detail::sort(temp_graph_view, stream); - rmm::device_buffer offsets = detail::create_offset( - temp_graph.src_indices(), total_vertex_count, temp_graph.number_of_edges(), stream, mr); + legacy::GraphCOO temp_graph(graph, stream_view.value(), mr); + legacy::GraphCOOView temp_graph_view = temp_graph.view(); + VT total_vertex_count = detail::sort(temp_graph_view, stream_view); + rmm::device_buffer offsets = detail::create_offset( + temp_graph.src_indices(), total_vertex_count, temp_graph.number_of_edges(), stream_view, mr); auto coo_contents = temp_graph.release(); - GraphSparseContents csr_contents{ + legacy::GraphSparseContents csr_contents{ total_vertex_count, coo_contents.number_of_edges, std::make_unique(std::move(offsets)), std::move(coo_contents.dst_indices), std::move(coo_contents.edge_data)}; - return std::make_unique>(std::move(csr_contents)); + return std::make_unique>(std::move(csr_contents)); } template -void coo_to_csr_inplace(GraphCOOView &graph, GraphCSRView &result) +void coo_to_csr_inplace(legacy::GraphCOOView& graph, + legacy::GraphCSRView& result) { - cudaStream_t stream{nullptr}; + rmm::cuda_stream_view stream_view; - detail::sort(graph, stream); - detail::fill_offset( - graph.src_indices, result.offsets, graph.number_of_vertices, graph.number_of_edges, stream); + detail::sort(graph, stream_view); + detail::fill_offset(graph.src_indices, + result.offsets, + graph.number_of_vertices, + graph.number_of_edges, + stream_view); CUDA_TRY(cudaMemcpy( result.indices, graph.dst_indices, sizeof(VT) * graph.number_of_edges, cudaMemcpyDefault)); @@ -184,60 +189,65 @@ void coo_to_csr_inplace(GraphCOOView &graph, GraphCSRView> -coo_to_csr(GraphCOOView const &graph, - rmm::mr::device_memory_resource *); +extern template std::unique_ptr> +coo_to_csr(legacy::GraphCOOView const& graph, + rmm::mr::device_memory_resource*); // EIDecl for uint32_t + double -extern template std::unique_ptr> -coo_to_csr(GraphCOOView const &graph, - rmm::mr::device_memory_resource *); +extern template std::unique_ptr> +coo_to_csr( + legacy::GraphCOOView const& graph, rmm::mr::device_memory_resource*); // EIDecl for int + float -extern template std::unique_ptr> -coo_to_csr(GraphCOOView const &graph, - rmm::mr::device_memory_resource *); +extern template std::unique_ptr> +coo_to_csr(legacy::GraphCOOView const& graph, + rmm::mr::device_memory_resource*); // EIDecl for int + double -extern template std::unique_ptr> -coo_to_csr(GraphCOOView const &graph, - rmm::mr::device_memory_resource *); +extern template std::unique_ptr> +coo_to_csr(legacy::GraphCOOView const& graph, + rmm::mr::device_memory_resource*); // EIDecl for int64_t + float -extern template std::unique_ptr> -coo_to_csr(GraphCOOView const &graph, - rmm::mr::device_memory_resource *); +extern template std::unique_ptr> +coo_to_csr(legacy::GraphCOOView const& graph, + rmm::mr::device_memory_resource*); // EIDecl for int64_t + double -extern template std::unique_ptr> 
-coo_to_csr(GraphCOOView const &graph, - rmm::mr::device_memory_resource *); +extern template std::unique_ptr> +coo_to_csr(legacy::GraphCOOView const& graph, + rmm::mr::device_memory_resource*); // in-place versions: // // EIDecl for uint32_t + float extern template void coo_to_csr_inplace( - GraphCOOView &graph, GraphCSRView &result); + legacy::GraphCOOView& graph, + legacy::GraphCSRView& result); // EIDecl for uint32_t + double extern template void coo_to_csr_inplace( - GraphCOOView &graph, - GraphCSRView &result); + legacy::GraphCOOView& graph, + legacy::GraphCSRView& result); // EIDecl for int + float extern template void coo_to_csr_inplace( - GraphCOOView &graph, GraphCSRView &result); + legacy::GraphCOOView& graph, + legacy::GraphCSRView& result); // EIDecl for int + double extern template void coo_to_csr_inplace( - GraphCOOView &graph, GraphCSRView &result); + legacy::GraphCOOView& graph, + legacy::GraphCSRView& result); // EIDecl for int64_t + float extern template void coo_to_csr_inplace( - GraphCOOView &graph, GraphCSRView &result); + legacy::GraphCOOView& graph, + legacy::GraphCSRView& result); // EIDecl for int64_t + double extern template void coo_to_csr_inplace( - GraphCOOView &graph, GraphCSRView &result); + legacy::GraphCOOView& graph, + legacy::GraphCSRView& result); } // namespace cugraph diff --git a/cpp/src/converters/permute_graph.cuh b/cpp/src/converters/permute_graph.cuh index b5b2de83e9b..024dfc2f3a7 100644 --- a/cpp/src/converters/permute_graph.cuh +++ b/cpp/src/converters/permute_graph.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,18 @@ * limitations under the License. */ #include -#include -#include +#include +#include +#include #include "converters/COOtoCSR.cuh" -#include "utilities/graph_utils.cuh" namespace cugraph { namespace detail { template struct permutation_functor { - IdxT const *permutation; - permutation_functor(IdxT const *p) : permutation(p) {} + IdxT const* permutation; + permutation_functor(IdxT const* p) : permutation(p) {} __host__ __device__ IdxT operator()(IdxT in) const { return permutation[in]; } }; @@ -42,9 +42,9 @@ struct permutation_functor { * @return The permuted graph. 
*/ template -void permute_graph(GraphCSRView const &graph, - vertex_t const *permutation, - GraphCSRView result, +void permute_graph(legacy::GraphCSRView const& graph, + vertex_t const* permutation, + legacy::GraphCSRView result, cudaStream_t stream = 0) { // Create a COO out of the CSR @@ -52,9 +52,9 @@ void permute_graph(GraphCSRView const &graph, rmm::device_vector dst_vertices_v(graph.number_of_edges); rmm::device_vector weights_v(graph.number_of_edges); - vertex_t *d_src = src_vertices_v.data().get(); - vertex_t *d_dst = dst_vertices_v.data().get(); - weight_t *d_weights = weights_v.data().get(); + vertex_t* d_src = src_vertices_v.data().get(); + vertex_t* d_dst = dst_vertices_v.data().get(); + weight_t* d_weights = weights_v.data().get(); graph.get_source_indices(d_src); @@ -76,7 +76,7 @@ void permute_graph(GraphCSRView const &graph, d_dst, pf); - GraphCOOView graph_coo; + legacy::GraphCOOView graph_coo; graph_coo.number_of_vertices = graph.number_of_vertices; graph_coo.number_of_edges = graph.number_of_edges; diff --git a/cpp/src/converters/renumber.cu b/cpp/src/converters/renumber.cu deleted file mode 100644 index 9aedbc70e8b..00000000000 --- a/cpp/src/converters/renumber.cu +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "renumber.cuh" - -namespace cugraph { - -template -std::unique_ptr renumber_vertices( - ET number_of_edges, - VT_IN const *src, - VT_IN const *dst, - VT_OUT *src_renumbered, - VT_OUT *dst_renumbered, - ET *map_size, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) - -{ - // - // For now, let's just specify a default value of the hash size. - // This should be configurable. - // - // FIXME: cudf has a hash table implementation (moving to cuCollections) - // that is dynamic. We should use it instead, it will be faster - // and dynamically adjust to data sizes. - // - int hash_size = 8191; - - return cugraph::detail::renumber_vertices(number_of_edges, - src, - dst, - src_renumbered, - dst_renumbered, - map_size, - cugraph::detail::HashFunctionObjectInt(hash_size), - thrust::less(), - mr); -} - -template std::unique_ptr renumber_vertices(int32_t, - int64_t const *, - int64_t const *, - int32_t *, - int32_t *, - int32_t *, - rmm::mr::device_memory_resource *); -template std::unique_ptr renumber_vertices(int32_t, - int32_t const *, - int32_t const *, - int32_t *, - int32_t *, - int32_t *, - rmm::mr::device_memory_resource *); - -} // namespace cugraph diff --git a/cpp/src/converters/renumber.cuh b/cpp/src/converters/renumber.cuh deleted file mode 100644 index 263d7199c10..00000000000 --- a/cpp/src/converters/renumber.cuh +++ /dev/null @@ -1,321 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#define CUB_STDERR - -#include - -#include - -#include -#include -#include -#include - -#include -#include - -#include -#include "sort/bitonic.cuh" -#include "utilities/graph_utils.cuh" - -namespace cugraph { -namespace detail { - -namespace renumber { -typedef uint32_t hash_type; -typedef uint32_t index_type; -} // namespace renumber - -class HashFunctionObjectInt { - public: - HashFunctionObjectInt(renumber::hash_type hash_size) : hash_size_(hash_size) {} - - template - __device__ __inline__ renumber::hash_type operator()(const VertexIdType &vertex_id) const - { - return ((vertex_id % hash_size_) + hash_size_) % hash_size_; - } - - renumber::hash_type getHashSize() const { return hash_size_; } - - private: - renumber::hash_type hash_size_; -}; - -/** - * @brief Renumber vertices to a dense numbering (0..vertex_size-1) - * - * This is a templated function so it can take 32 or 64 bit integers. The - * intention is to take source and destination vertex ids that might be - * sparsely scattered across the range and push things down to a dense - * numbering. - * - * Arrays src, dst, src_renumbered, dst_renumbered and numbering_map are - * assumed to be pre-allocated. numbering_map is best safely allocated - * to store 2 * size vertices. - * - * @param[in] size Number of edges - * @param[in] src List of source vertices - * @param[in] dst List of dest vertices - * @param[out] src_renumbered List of source vertices, renumbered - * @param[out] dst_renumbered List of dest vertices, renumbered - * @param[out] vertex_size Number of unique vertices - * @param[out] numbering_map Map of new vertex id to original vertex id. numbering_map[newId] - * = oldId - * - */ -template -std::unique_ptr renumber_vertices(T_size size, - const T_in *src, - const T_in *dst, - T_out *src_renumbered, - T_out *dst_renumbered, - T_size *map_size, - Hash_t hash, - Compare_t compare, - rmm::mr::device_memory_resource *mr) -{ - // - // This function will allocate numbering_map to be the exact size needed - // (user doesn't know a priori how many unique vertices there are. - // - // Here's the idea: Create a hash table. Since we're dealing with integers, - // we can take the integer modulo some prime p to create hash buckets. Then - // we dedupe the hash buckets to create a deduped set of entries. This hash - // table can then be used to renumber everything. 
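[Editor's sketch, not part of the patch] For readers tracking this deletion: what the removed routine computed is a dense renumbering plus a new-id-to-old-id map. A compact host analogue (a sketch only; it uses std::unordered_map and assigns ids in first-seen order, whereas the deleted code assigned ids in hash-bin-sorted order):

#include <cstdint>
#include <unordered_map>
#include <vector>

// Rewrites src/dst to dense ids in [0, number of unique vertices) and returns
// the map new_id -> old_id, mirroring the contract of the deleted renumber_vertices.
std::vector<int64_t> renumber(std::vector<int64_t>& src, std::vector<int64_t>& dst)
{
  std::unordered_map<int64_t, int64_t> new_id;
  std::vector<int64_t> numbering_map;
  auto lookup = [&](int64_t v) {
    auto [it, inserted] = new_id.try_emplace(v, static_cast<int64_t>(numbering_map.size()));
    if (inserted) { numbering_map.push_back(v); }
    return it->second;
  };
  for (auto& v : src) { v = lookup(v); }
  for (auto& v : dst) { v = lookup(v); }
  return numbering_map;
}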
- // - // We need 2 arrays for hash indexes, and one array for data - // - cudaStream_t stream = nullptr; - - renumber::hash_type hash_size = hash.getHashSize(); - - rmm::device_vector hash_data_v(2 * size); - rmm::device_vector hash_bins_start_v(1 + hash_size, - renumber::index_type{0}); - rmm::device_vector hash_bins_end_v(1 + hash_size); - - T_in *hash_data = hash_data_v.data().get(); - renumber::index_type *hash_bins_start = hash_bins_start_v.data().get(); - renumber::index_type *hash_bins_end = hash_bins_end_v.data().get(); - - // - // Pass 1: count how many vertex ids end up in each hash bin - // - thrust::for_each(rmm::exec_policy(stream)->on(stream), - src, - src + size, - [hash_bins_start, hash] __device__(T_in vid) { - atomicAdd(hash_bins_start + hash(vid), renumber::index_type{1}); - }); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - dst, - dst + size, - [hash_bins_start, hash] __device__(T_in vid) { - atomicAdd(hash_bins_start + hash(vid), renumber::index_type{1}); - }); - - // - // Compute exclusive sum and copy it into both hash_bins_start and - // hash_bins_end. hash_bins_end will be used to populate the - // hash_data array and at the end will identify the end of - // each range. - // - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), - hash_bins_start, - hash_bins_start + hash_size + 1, - hash_bins_end); - - CUDA_TRY(cudaMemcpy(hash_bins_start, - hash_bins_end, - (hash_size + 1) * sizeof(renumber::hash_type), - cudaMemcpyDeviceToDevice)); - - // - // Pass 2: Populate hash_data with data from the hash bins. - // - thrust::for_each(rmm::exec_policy(stream)->on(stream), - src, - src + size, - [hash_bins_end, hash_data, hash] __device__(T_in vid) { - uint32_t hash_index = hash(vid); - renumber::index_type hash_offset = atomicAdd(&hash_bins_end[hash_index], 1); - hash_data[hash_offset] = vid; - }); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - dst, - dst + size, - [hash_bins_end, hash_data, hash] __device__(T_in vid) { - uint32_t hash_index = hash(vid); - renumber::index_type hash_offset = atomicAdd(&hash_bins_end[hash_index], 1); - hash_data[hash_offset] = vid; - }); - - // - // Now that we have data in hash bins, we'll do a segmented sort of the has bins - // to sort each bin. This will allow us to identify duplicates (all duplicates - // are in the same hash bin so they will end up sorted consecutively). - // - renumber::index_type size_as_int = size; - cugraph::sort::bitonic::segmented_sort( - hash_size, size_as_int, hash_bins_start, hash_bins_end, hash_data, compare, stream); - - // - // Now we rinse and repeat. hash_data contains the data organized into sorted - // hash bins. This allows us to identify duplicates. We'll start over but - // we'll skip the duplicates when we repopulate the hash table. - // - - // - // Pass 3: count how many vertex ids end up in each hash bin after deduping - // - CUDA_TRY(cudaMemset(hash_bins_start, 0, (1 + hash_size) * sizeof(renumber::index_type))); - - thrust::for_each( - rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(2 * size), - [hash_data, hash_bins_start, hash, compare, size] __device__(renumber::index_type idx) { - // - // Two items (a and b) are equal if - // compare(a,b) is false and compare(b,a) - // is also false. If either is true then - // a and b are not equal. 
- // - // Note that if there are k duplicate - // instances of an entry, only the LAST - // entry will be counted - // - bool unique = ((idx + 1) == (2 * size)) || compare(hash_data[idx], hash_data[idx + 1]) || - compare(hash_data[idx + 1], hash_data[idx]); - - if (unique) atomicAdd(hash_bins_start + hash(hash_data[idx]), renumber::index_type{1}); - }); - - // - // Compute exclusive sum and copy it into both hash_bins_start and - // hash_bins_end. - // - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), - hash_bins_start, - hash_bins_start + hash_size + 1, - hash_bins_end); - - CUDA_TRY(cudaMemcpy(hash_bins_start, - hash_bins_end, - (hash_size + 1) * sizeof(renumber::hash_type), - cudaMemcpyDeviceToDevice)); - - // - // The last entry in the array (hash_bins_end[hash_size]) is the - // total number of unique vertices - // - renumber::index_type temp = 0; - CUDA_TRY(cudaMemcpy( - &temp, hash_bins_end + hash_size, sizeof(renumber::index_type), cudaMemcpyDeviceToHost)); - *map_size = temp; - - rmm::device_buffer numbering_map(temp * sizeof(T_in), stream, mr); - T_in *local_numbering_map = static_cast(numbering_map.data()); - - // - // Pass 4: Populate hash_data with data from the hash bins after deduping - // - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(2 * size), - [hash_bins_end, hash_data, local_numbering_map, hash, compare, size] __device__( - renumber::index_type idx) { - bool unique = ((idx + 1) == (2 * size)) || - compare(hash_data[idx], hash_data[idx + 1]) || - compare(hash_data[idx + 1], hash_data[idx]); - - if (unique) { - uint32_t hash_index = hash(hash_data[idx]); - renumber::index_type hash_offset = atomicAdd(&hash_bins_end[hash_index], 1); - local_numbering_map[hash_offset] = hash_data[idx]; - } - }); - - // - // At this point, hash_bins_start and numbering_map partition the - // unique data into a hash table. - // - - // - // If we do a segmented sort now, we can do the final lookups. - // - size_as_int = size; - cugraph::sort::bitonic::segmented_sort( - hash_size, size_as_int, hash_bins_start, hash_bins_end, local_numbering_map, compare, stream); - - // - // Renumber the input. For each vertex, identify the - // hash bin, and then search the hash bin for the - // record that matches; the relative offset between that - // element and the beginning of the array is the vertex - // id in the renumbered map.
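The final lookup described above is a binary search restricted to the vertex's hash bin; the new id is the matching record's offset from the start of the deduped table. A host-side sketch, assuming the same bin_start/bin_end/table layout (the device code does the equivalent with thrust::lower_bound and a custom comparator):

#include <algorithm>
#include <cstdint>
#include <vector>

uint32_t renumbered_id(uint32_t v, uint32_t hash_size,
                       std::vector<uint32_t> const& bin_start,
                       std::vector<uint32_t> const& bin_end,
                       std::vector<uint32_t> const& table)  // deduped, sorted within each bin
{
  uint32_t h = v % hash_size;
  auto it = std::lower_bound(table.begin() + bin_start[h], table.begin() + bin_end[h], v);
  return static_cast<uint32_t>(it - table.begin());  // offset from table start == new id
}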
- // - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(size), - [local_numbering_map, - hash_bins_start, - hash_bins_end, - hash, - src, - src_renumbered, - compare] __device__(renumber::index_type idx) { - renumber::hash_type tmp = hash(src[idx]); - const T_in *id = - thrust::lower_bound(thrust::seq, - local_numbering_map + hash_bins_start[tmp], - local_numbering_map + hash_bins_end[tmp], - src[idx], - compare); - src_renumbered[idx] = id - local_numbering_map; - }); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(size), - [local_numbering_map, - hash_bins_start, - hash_bins_end, - hash, - dst, - dst_renumbered, - compare] __device__(renumber::index_type idx) { - renumber::hash_type tmp = hash(dst[idx]); - const T_in *id = - thrust::lower_bound(thrust::seq, - local_numbering_map + hash_bins_start[tmp], - local_numbering_map + hash_bins_end[tmp], - dst[idx], - compare); - dst_renumbered[idx] = id - local_numbering_map; - }); - - return std::make_unique(std::move(numbering_map)); -} - -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/cores/core_number.cu b/cpp/src/cores/core_number.cu index cd2b928a81e..b23e7a25405 100644 --- a/cpp/src/cores/core_number.cu +++ b/cpp/src/cores/core_number.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,15 +17,15 @@ #include #include #include -#include -#include +#include +#include //#include namespace cugraph { namespace detail { template -void core_number(GraphCSRView const &graph, int *core_number) +void core_number(legacy::GraphCSRView const& graph, int* core_number) { using HornetGraph = hornet::gpu::HornetStatic; using HornetInit = hornet::HornetInit; @@ -38,9 +38,9 @@ void core_number(GraphCSRView const &graph, int *core_number) struct FilterEdges { int k; - int *core_number; + int* core_number; - FilterEdges(int _k, int *d_core_num) : k(_k), core_number(d_core_num) {} + FilterEdges(int _k, int* d_core_num) : k(_k), core_number(d_core_num) {} template __host__ __device__ bool operator()(T t) @@ -52,9 +52,9 @@ struct FilterEdges { }; template -void extract_edges(GraphCOOView const &i_graph, - GraphCOOView &o_graph, - VT *d_core, +void extract_edges(legacy::GraphCOOView const& i_graph, + legacy::GraphCOOView& o_graph, + VT* d_core, int k) { cudaStream_t stream{nullptr}; @@ -96,13 +96,13 @@ void extract_edges(GraphCOOView const &i_graph, // i.e. All edges (s,d,w) in in_graph are copied over to out_graph // if core_num[s] and core_num[d] are greater than or equal to k. 
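Restating the extraction rule in the comment above as stand-alone host code (illustrative only; the device path applies the same predicate via a functor over zipped COO iterators):

#include <vector>

struct Edge { int src; int dst; float weight; };

// Keep an edge only when both endpoints have core number >= k.
std::vector<Edge> extract_k_core_edges(std::vector<Edge> const& edges,
                                       std::vector<int> const& core_number, int k)
{
  std::vector<Edge> kept;
  for (auto const& e : edges) {
    if (core_number[e.src] >= k && core_number[e.dst] >= k) { kept.push_back(e); }
  }
  return kept;
}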
template -std::unique_ptr> extract_subgraph( - GraphCOOView const &in_graph, - int const *vid, - int const *core_num, +std::unique_ptr> extract_subgraph( + legacy::GraphCOOView const& in_graph, + int const* vid, + int const* core_num, int k, int len, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { cudaStream_t stream{nullptr}; @@ -112,14 +112,14 @@ std::unique_ptr> extract_subgraph( thrust::scatter( rmm::exec_policy(stream)->on(stream), core_num, core_num + len, vid, sorted_core_num.begin()); - VT *d_sorted_core_num = sorted_core_num.data().get(); + VT* d_sorted_core_num = sorted_core_num.data().get(); // Count number of edges in the input graph that satisfy kcore conditions // i.e. core_num[src] and core_num[dst] are both greater than or equal to k auto edge = thrust::make_zip_iterator(thrust::make_tuple(in_graph.src_indices, in_graph.dst_indices)); - auto out_graph = std::make_unique>( + auto out_graph = std::make_unique>( in_graph.number_of_vertices, thrust::count_if(rmm::exec_policy(stream)->on(stream), edge, @@ -129,7 +129,7 @@ std::unique_ptr> extract_subgraph( stream, mr); - GraphCOOView out_graph_view = out_graph->view(); + legacy::GraphCOOView out_graph_view = out_graph->view(); extract_edges(in_graph, out_graph_view, d_sorted_core_num, k); return out_graph; @@ -138,41 +138,42 @@ std::unique_ptr> extract_subgraph( } // namespace detail template -void core_number(GraphCSRView const &graph, VT *core_number) +void core_number(legacy::GraphCSRView const& graph, VT* core_number) { return detail::core_number(graph, core_number); } template -std::unique_ptr> k_core(GraphCOOView const &in_graph, - int k, - VT const *vertex_id, - VT const *core_number, - VT num_vertex_ids, - rmm::mr::device_memory_resource *mr) +std::unique_ptr> k_core( + legacy::GraphCOOView const& in_graph, + int k, + VT const* vertex_id, + VT const* core_number, + VT num_vertex_ids, + rmm::mr::device_memory_resource* mr) { - CUGRAPH_EXPECTS(vertex_id != nullptr, "Invalid API parameter: vertex_id is NULL"); - CUGRAPH_EXPECTS(core_number != nullptr, "Invalid API parameter: core_number is NULL"); - CUGRAPH_EXPECTS(k >= 0, "Invalid API parameter: k must be >= 0"); + CUGRAPH_EXPECTS(vertex_id != nullptr, "Invalid input argument: vertex_id is NULL"); + CUGRAPH_EXPECTS(core_number != nullptr, "Invalid input argument: core_number is NULL"); + CUGRAPH_EXPECTS(k >= 0, "Invalid input argument: k must be >= 0"); return detail::extract_subgraph(in_graph, vertex_id, core_number, k, num_vertex_ids, mr); } -template void core_number(GraphCSRView const &, - int32_t *core_number); -template std::unique_ptr> k_core( - GraphCOOView const &, - int, - int32_t const *, - int32_t const *, - int32_t, - rmm::mr::device_memory_resource *); -template std::unique_ptr> k_core( - GraphCOOView const &, +template void core_number( + legacy::GraphCSRView const&, int32_t* core_number); +template std::unique_ptr> k_core( + legacy::GraphCOOView const&, int, - int32_t const *, - int32_t const *, + int32_t const*, + int32_t const*, int32_t, - rmm::mr::device_memory_resource *); + rmm::mr::device_memory_resource*); +template std::unique_ptr> +k_core(legacy::GraphCOOView const&, + int, + int32_t const*, + int32_t const*, + int32_t, + rmm::mr::device_memory_resource*); } // namespace cugraph diff --git a/cpp/src/detail/shuffle_wrappers.cu b/cpp/src/detail/shuffle_wrappers.cu new file mode 100644 index 00000000000..adf5fdfbc11 --- /dev/null +++ 
b/cpp/src/detail/shuffle_wrappers.cu @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cugraph { +namespace detail { + +template +std::tuple, + rmm::device_uvector, + std::optional>> +shuffle_edgelist_by_edge(raft::handle_t const& handle, + rmm::device_uvector& d_edgelist_rows, + rmm::device_uvector& d_edgelist_cols, + std::optional>& d_edgelist_weights, + bool store_transposed) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + // TODO: Make a shuffle_edges and shuffle_vertices out of these... + rmm::device_uvector d_rx_edgelist_rows(0, handle.get_stream()); + rmm::device_uvector d_rx_edgelist_cols(0, handle.get_stream()); + std::optional> d_rx_edgelist_weights{std::nullopt}; + if (d_edgelist_weights) { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(store_transposed ? d_edgelist_cols.begin() : d_edgelist_rows.begin(), + store_transposed ? d_edgelist_rows.begin() : d_edgelist_cols.begin(), + (*d_edgelist_weights).begin())); + + std::forward_as_tuple(std::tie(store_transposed ? d_rx_edgelist_cols : d_rx_edgelist_rows, + store_transposed ? d_rx_edgelist_rows : d_rx_edgelist_cols, + d_rx_edgelist_weights), + std::ignore) = + cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + edge_first, + edge_first + d_edgelist_rows.size(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(store_transposed ? d_edgelist_cols.begin() : d_edgelist_rows.begin(), + store_transposed ? d_edgelist_rows.begin() : d_edgelist_cols.begin())); + + std::forward_as_tuple(std::tie(store_transposed ? d_rx_edgelist_cols : d_rx_edgelist_rows, + store_transposed ? 
d_rx_edgelist_rows : d_rx_edgelist_cols), + std::ignore) = + cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + edge_first, + edge_first + d_edgelist_rows.size(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + } + + return std::make_tuple( + std::move(d_rx_edgelist_rows), std::move(d_rx_edgelist_cols), std::move(d_rx_edgelist_weights)); +} + +template std::tuple, + rmm::device_uvector, + std::optional>> +shuffle_edgelist_by_edge(raft::handle_t const& handle, + rmm::device_uvector& d_edgelist_rows, + rmm::device_uvector& d_edgelist_cols, + std::optional>& d_edgelist_weights, + bool store_transposed); + +template std::tuple, + rmm::device_uvector, + std::optional>> +shuffle_edgelist_by_edge(raft::handle_t const& handle, + rmm::device_uvector& d_edgelist_rows, + rmm::device_uvector& d_edgelist_cols, + std::optional>& d_edgelist_weights, + bool store_transposed); + +template std::tuple, + rmm::device_uvector, + std::optional>> +shuffle_edgelist_by_edge(raft::handle_t const& handle, + rmm::device_uvector& d_edgelist_rows, + rmm::device_uvector& d_edgelist_cols, + std::optional>& d_edgelist_weights, + bool store_transposed); + +template std::tuple, + rmm::device_uvector, + std::optional>> +shuffle_edgelist_by_edge(raft::handle_t const& handle, + rmm::device_uvector& d_edgelist_rows, + rmm::device_uvector& d_edgelist_cols, + std::optional>& d_edgelist_weights, + bool store_transposed); + +template +rmm::device_uvector shuffle_vertices(raft::handle_t const& handle, + rmm::device_uvector& d_vertices) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + rmm::device_uvector d_rx_vertices(0, handle.get_stream()); + std::tie(d_rx_vertices, std::ignore) = cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + d_vertices.begin(), + d_vertices.end(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{ + comm_size}] __device__(auto val) { return key_func(val); }, + handle.get_stream()); + + return d_rx_vertices; +} + +template rmm::device_uvector shuffle_vertices(raft::handle_t const& handle, + rmm::device_uvector& d_vertices); + +template rmm::device_uvector shuffle_vertices(raft::handle_t const& handle, + rmm::device_uvector& d_vertices); + +template +rmm::device_uvector groupby_and_count_by_edge( + raft::handle_t const& handle, + rmm::device_uvector& d_edgelist_rows, + rmm::device_uvector& d_edgelist_cols, + std::optional>& d_edgelist_weights, + size_t number_of_local_adj_matrix_partitions) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto const row_comm_rank = row_comm.get_rank(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + auto const col_comm_rank = col_comm.get_rank(); + + auto local_partition_id_op = + [comm_size, + key_func = cugraph::experimental::detail::compute_partition_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto pair) { + return key_func(thrust::get<0>(pair), thrust::get<1>(pair)) / + comm_size; // global partition id to local partition id + 
}; + + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(d_edgelist_rows.begin(), d_edgelist_cols.begin())); + + return d_edgelist_weights + ? cugraph::experimental::groupby_and_count(pair_first, + pair_first + d_edgelist_rows.size(), + d_edgelist_weights->begin(), + local_partition_id_op, + number_of_local_adj_matrix_partitions, + handle.get_stream()) + : cugraph::experimental::groupby_and_count(pair_first, + pair_first + d_edgelist_rows.size(), + local_partition_id_op, + number_of_local_adj_matrix_partitions, + handle.get_stream()); +} + +template rmm::device_uvector groupby_and_count_by_edge( + raft::handle_t const& handle, + rmm::device_uvector& d_edgelist_rows, + rmm::device_uvector& d_edgelist_cols, + std::optional>& d_edgelist_weights, + size_t number_of_local_adj_matrix_partitions); + +template rmm::device_uvector groupby_and_count_by_edge( + raft::handle_t const& handle, + rmm::device_uvector& d_edgelist_rows, + rmm::device_uvector& d_edgelist_cols, + std::optional>& d_edgelist_weights, + size_t number_of_local_adj_matrix_partitions); + +template rmm::device_uvector groupby_and_count_by_edge( + raft::handle_t const& handle, + rmm::device_uvector& d_edgelist_rows, + rmm::device_uvector& d_edgelist_cols, + std::optional>& d_edgelist_weights, + size_t number_of_local_adj_matrix_partitions); + +template rmm::device_uvector groupby_and_count_by_edge( + raft::handle_t const& handle, + rmm::device_uvector& d_edgelist_rows, + rmm::device_uvector& d_edgelist_cols, + std::optional>& d_edgelist_weights, + size_t number_of_local_adj_matrix_partitions); + +} // namespace detail +} // namespace cugraph diff --git a/cpp/src/detail/utility_wrappers.cu b/cpp/src/detail/utility_wrappers.cu new file mode 100644 index 00000000000..83a37d6b316 --- /dev/null +++ b/cpp/src/detail/utility_wrappers.cu @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include + +#include + +#include +#include + +namespace cugraph { +namespace detail { + +template +void uniform_random_fill(rmm::cuda_stream_view const& stream_view, + value_t* d_value, + size_t size, + value_t min_value, + value_t max_value, + uint64_t seed) +{ + raft::random::Rng rng(seed); + rng.uniform(d_value, size, min_value, max_value, stream_view.value()); +} + +template void uniform_random_fill(rmm::cuda_stream_view const& stream_view, + float* d_value, + size_t size, + float min_value, + float max_value, + uint64_t seed); + +template void uniform_random_fill(rmm::cuda_stream_view const& stream_view, + double* d_value, + size_t size, + double min_value, + double max_value, + uint64_t seed); + +template +void sequence_fill(rmm::cuda_stream_view const& stream_view, + value_t* d_value, + size_t size, + value_t start_value) +{ + thrust::sequence(rmm::exec_policy(stream_view), d_value, d_value + size, start_value); +} + +template void sequence_fill(rmm::cuda_stream_view const& stream_view, + int32_t* d_value, + size_t size, + int32_t start_value); + +template void sequence_fill(rmm::cuda_stream_view const& stream_view, + int64_t* d_value, + size_t size, + int64_t start_value); + +template +vertex_t compute_maximum_vertex_id(rmm::cuda_stream_view const& stream_view, + rmm::device_uvector const& d_edgelist_rows, + rmm::device_uvector const& d_edgelist_cols) +{ + auto edge_first = + thrust::make_zip_iterator(thrust::make_tuple(d_edgelist_rows.begin(), d_edgelist_cols.begin())); + + return thrust::transform_reduce( + rmm::exec_policy(stream_view), + edge_first, + edge_first + d_edgelist_rows.size(), + [] __device__(auto e) { return std::max(thrust::get<0>(e), thrust::get<1>(e)); }, + vertex_t{0}, + thrust::maximum()); +} + +template int32_t compute_maximum_vertex_id(rmm::cuda_stream_view const& stream_view, + rmm::device_uvector const& d_edgelist_rows, + rmm::device_uvector const& d_edgelist_cols); + +template int64_t compute_maximum_vertex_id(rmm::cuda_stream_view const& stream_view, + rmm::device_uvector const& d_edgelist_rows, + rmm::device_uvector const& d_edgelist_cols); + +} // namespace detail +} // namespace cugraph diff --git a/cpp/src/experimental/bfs.cu b/cpp/src/experimental/bfs.cu index f297587a1d6..903228a79f6 100644 --- a/cpp/src/experimental/bfs.cu +++ b/cpp/src/experimental/bfs.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,13 @@ * limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -41,9 +42,9 @@ namespace experimental { namespace detail { template -void bfs(raft::handle_t const &handle, - GraphViewType const &push_graph_view, - typename GraphViewType::vertex_type *distances, +void bfs(raft::handle_t const& handle, + GraphViewType const& push_graph_view, + typename GraphViewType::vertex_type* distances, PredecessorIterator predecessor_first, typename GraphViewType::vertex_type source_vertex, bool direction_optimizing, @@ -90,14 +91,12 @@ void bfs(raft::handle_t const &handle, // 3. 
initialize BFS frontier - enum class Bucket { cur, num_buckets }; - std::vector bucket_sizes(static_cast(Bucket::num_buckets), - push_graph_view.get_number_of_local_vertices()); - VertexFrontier, - vertex_t, + enum class Bucket { cur, next, num_buckets }; + VertexFrontier(Bucket::num_buckets)> - vertex_frontier(handle, bucket_sizes); + vertex_frontier(handle); if (push_graph_view.is_local_vertex_nocheck(source_vertex)) { vertex_frontier.get_bucket(static_cast(Bucket::cur)).insert(source_vertex); @@ -106,23 +105,19 @@ void bfs(raft::handle_t const &handle, // 4. BFS iteration vertex_t depth{0}; - auto cur_local_vertex_frontier_first = - vertex_frontier.get_bucket(static_cast(Bucket::cur)).begin(); - auto cur_vertex_frontier_aggregate_size = - vertex_frontier.get_bucket(static_cast(Bucket::cur)).aggregate_size(); while (true) { if (direction_optimizing) { CUGRAPH_FAIL("unimplemented."); } else { - vertex_partition_device_t vertex_partition(push_graph_view); + auto vertex_partition = vertex_partition_device_view_t( + push_graph_view.get_vertex_partition_view()); - auto cur_local_vertex_frontier_last = - vertex_frontier.get_bucket(static_cast(Bucket::cur)).end(); update_frontier_v_push_if_out_nbr( handle, push_graph_view, - cur_local_vertex_frontier_first, - cur_local_vertex_frontier_last, + vertex_frontier, + static_cast(Bucket::cur), + std::vector{static_cast(Bucket::next)}, thrust::make_constant_iterator(0) /* dummy */, thrust::make_constant_iterator(0) /* dummy */, [vertex_partition, distances] __device__( @@ -133,28 +128,27 @@ void bfs(raft::handle_t const &handle, *(distances + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(dst)); if (distance != invalid_distance) { push = false; } } - // FIXME: need to test this works properly if payload size is 0 (returns a tuple of size - // 1) - return thrust::make_tuple(push, src); + return push ? thrust::optional{src} : thrust::nullopt; }, - reduce_op::any>(), + reduce_op::any(), distances, thrust::make_zip_iterator(thrust::make_tuple(distances, predecessor_first)), - vertex_frontier, - [depth] __device__(auto v_val, auto pushed_val) { - auto idx = (v_val == invalid_distance) - ? static_cast(Bucket::cur) - : VertexFrontier, vertex_t>::kInvalidBucketIdx; - return thrust::make_tuple(idx, depth + 1, thrust::get<0>(pushed_val)); + [depth] __device__(auto v, auto v_val, auto pushed_val) { + return (v_val == invalid_distance) + ? thrust::optional< + thrust::tuple>>{thrust::make_tuple( + static_cast(Bucket::next), + thrust::make_tuple(depth + 1, pushed_val))} + : thrust::nullopt; }); - auto new_vertex_frontier_aggregate_size = - vertex_frontier.get_bucket(static_cast(Bucket::cur)).aggregate_size() - - cur_vertex_frontier_aggregate_size; - if (new_vertex_frontier_aggregate_size == 0) { break; } - - cur_local_vertex_frontier_first = cur_local_vertex_frontier_last; - cur_vertex_frontier_aggregate_size += new_vertex_frontier_aggregate_size; + vertex_frontier.get_bucket(static_cast(Bucket::cur)).clear(); + vertex_frontier.get_bucket(static_cast(Bucket::cur)).shrink_to_fit(); + vertex_frontier.swap_buckets(static_cast(Bucket::cur), + static_cast(Bucket::next)); + if (vertex_frontier.get_bucket(static_cast(Bucket::cur)).aggregate_size() == 0) { + break; + } } depth++; @@ -165,17 +159,15 @@ void bfs(raft::handle_t const &handle, handle.get_stream())); // this is as necessary vertex_frontier will become out-of-scope once // this function returns (FIXME: should I stream sync in VertexFrontier // destructor?) 
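The reworked loop above is level-synchronous BFS with explicit cur/next buckets: expand the cur frontier, push a neighbor only if its distance is still invalid, then clear cur and swap it with next. A single-threaded analogue over a CSR graph (bfs_host is a hypothetical helper; the device version distributes this work across GPUs):

#include <limits>
#include <vector>

void bfs_host(std::vector<int> const& offsets, std::vector<int> const& indices,
              int source, std::vector<int>& dist, std::vector<int>& pred)
{
  int const invalid = std::numeric_limits<int>::max();
  dist.assign(offsets.size() - 1, invalid);
  pred.assign(offsets.size() - 1, -1);
  std::vector<int> cur{source}, next;
  dist[source] = 0;
  for (int depth = 0; !cur.empty(); ++depth) {
    for (int u : cur) {
      for (int e = offsets[u]; e < offsets[u + 1]; ++e) {
        int v = indices[e];
        if (dist[v] == invalid) {  // the "push" test in the device lambda
          dist[v] = depth + 1;
          pred[v] = u;
          next.push_back(v);
        }
      }
    }
    cur.swap(next);  // mirrors vertex_frontier.swap_buckets(cur, next)
    next.clear();
  }
}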
- - return; } } // namespace detail template -void bfs(raft::handle_t const &handle, - graph_view_t const &graph_view, - vertex_t *distances, - vertex_t *predecessors, +void bfs(raft::handle_t const& handle, + graph_view_t const& graph_view, + vertex_t* distances, + vertex_t* predecessors, vertex_t source_vertex, bool direction_optimizing, vertex_t depth_limit, @@ -204,109 +196,109 @@ void bfs(raft::handle_t const &handle, // explicit instantiation -template void bfs(raft::handle_t const &handle, - graph_view_t const &graph_view, - int32_t *distances, - int32_t *predecessors, +template void bfs(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t* distances, + int32_t* predecessors, int32_t source_vertex, bool direction_optimizing, int32_t depth_limit, bool do_expensive_check); -template void bfs(raft::handle_t const &handle, - graph_view_t const &graph_view, - int32_t *distances, - int32_t *predecessors, +template void bfs(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t* distances, + int32_t* predecessors, int32_t source_vertex, bool direction_optimizing, int32_t depth_limit, bool do_expensive_check); -template void bfs(raft::handle_t const &handle, - graph_view_t const &graph_view, - int32_t *distances, - int32_t *predecessors, +template void bfs(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t* distances, + int32_t* predecessors, int32_t source_vertex, bool direction_optimizing, int32_t depth_limit, bool do_expensive_check); -template void bfs(raft::handle_t const &handle, - graph_view_t const &graph_view, - int32_t *distances, - int32_t *predecessors, +template void bfs(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t* distances, + int32_t* predecessors, int32_t source_vertex, bool direction_optimizing, int32_t depth_limit, bool do_expensive_check); -template void bfs(raft::handle_t const &handle, - graph_view_t const &graph_view, - int64_t *distances, - int64_t *predecessors, +template void bfs(raft::handle_t const& handle, + graph_view_t const& graph_view, + int64_t* distances, + int64_t* predecessors, int64_t source_vertex, bool direction_optimizing, int64_t depth_limit, bool do_expensive_check); -template void bfs(raft::handle_t const &handle, - graph_view_t const &graph_view, - int64_t *distances, - int64_t *predecessors, +template void bfs(raft::handle_t const& handle, + graph_view_t const& graph_view, + int64_t* distances, + int64_t* predecessors, int64_t source_vertex, bool direction_optimizing, int64_t depth_limit, bool do_expensive_check); -template void bfs(raft::handle_t const &handle, - graph_view_t const &graph_view, - int32_t *distances, - int32_t *predecessors, +template void bfs(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t* distances, + int32_t* predecessors, int32_t source_vertex, bool direction_optimizing, int32_t depth_limit, bool do_expensive_check); -template void bfs(raft::handle_t const &handle, - graph_view_t const &graph_view, - int32_t *distances, - int32_t *predecessors, +template void bfs(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t* distances, + int32_t* predecessors, int32_t source_vertex, bool direction_optimizing, int32_t depth_limit, bool do_expensive_check); -template void bfs(raft::handle_t const &handle, - graph_view_t const &graph_view, - int32_t *distances, - int32_t *predecessors, +template void bfs(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t* distances, + int32_t* 
predecessors, int32_t source_vertex, bool direction_optimizing, int32_t depth_limit, bool do_expensive_check); -template void bfs(raft::handle_t const &handle, - graph_view_t const &graph_view, - int32_t *distances, - int32_t *predecessors, +template void bfs(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t* distances, + int32_t* predecessors, int32_t source_vertex, bool direction_optimizing, int32_t depth_limit, bool do_expensive_check); -template void bfs(raft::handle_t const &handle, - graph_view_t const &graph_view, - int64_t *distances, - int64_t *predecessors, +template void bfs(raft::handle_t const& handle, + graph_view_t const& graph_view, + int64_t* distances, + int64_t* predecessors, int64_t source_vertex, bool direction_optimizing, int64_t depth_limit, bool do_expensive_check); -template void bfs(raft::handle_t const &handle, - graph_view_t const &graph_view, - int64_t *distances, - int64_t *predecessors, +template void bfs(raft::handle_t const& handle, + graph_view_t const& graph_view, + int64_t* distances, + int64_t* predecessors, int64_t source_vertex, bool direction_optimizing, int64_t depth_limit, diff --git a/cpp/src/experimental/coarsen_graph.cu b/cpp/src/experimental/coarsen_graph.cu new file mode 100644 index 00000000000..f34332464f2 --- /dev/null +++ b/cpp/src/experimental/coarsen_graph.cu @@ -0,0 +1,747 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cugraph { +namespace experimental { +namespace detail { + +template +std::tuple, + rmm::device_uvector, + std::optional>> +decompress_matrix_partition_to_edgelist( + raft::handle_t const& handle, + matrix_partition_device_view_t const matrix_partition, + std::optional> const& segment_offsets) +{ + auto number_of_edges = matrix_partition.get_number_of_edges(); + rmm::device_uvector edgelist_major_vertices(number_of_edges, handle.get_stream()); + rmm::device_uvector edgelist_minor_vertices(number_of_edges, handle.get_stream()); + auto edgelist_weights = + matrix_partition.get_weights() + ? 
std::make_optional>(number_of_edges, handle.get_stream()) + : std::nullopt; + + decompress_matrix_partition_to_fill_edgelist_majors( + handle, matrix_partition, edgelist_major_vertices.data(), segment_offsets); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + matrix_partition.get_indices(), + matrix_partition.get_indices() + number_of_edges, + edgelist_minor_vertices.begin()); + if (edgelist_weights) { + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + *(matrix_partition.get_weights()), + *(matrix_partition.get_weights()) + number_of_edges, + (*edgelist_weights).data()); + } + + return std::make_tuple(std::move(edgelist_major_vertices), + std::move(edgelist_minor_vertices), + std::move(edgelist_weights)); +} + +template +edge_t groupby_e_and_coarsen_edgelist(vertex_t* edgelist_major_vertices /* [INOUT] */, + vertex_t* edgelist_minor_vertices /* [INOUT] */, + std::optional edgelist_weights /* [INOUT] */, + edge_t number_of_edges, + cudaStream_t stream) +{ + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices)); + + if (edgelist_weights) { + thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), + pair_first, + pair_first + number_of_edges, + *edgelist_weights); + + rmm::device_uvector tmp_edgelist_major_vertices(number_of_edges, stream); + rmm::device_uvector tmp_edgelist_minor_vertices(tmp_edgelist_major_vertices.size(), + stream); + rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_major_vertices.size(), stream); + auto it = thrust::reduce_by_key( + rmm::exec_policy(stream)->on(stream), + pair_first, + pair_first + number_of_edges, + (*edgelist_weights), + thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), + tmp_edgelist_minor_vertices.begin())), + tmp_edgelist_weights.begin()); + auto ret = + static_cast(thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it))); + + auto edge_first = + thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), + tmp_edgelist_minor_vertices.begin(), + tmp_edgelist_weights.begin())); + thrust::copy(rmm::exec_policy(stream)->on(stream), + edge_first, + edge_first + ret, + thrust::make_zip_iterator(thrust::make_tuple( + edgelist_major_vertices, edgelist_minor_vertices, *edgelist_weights))); + + return ret; + } else { + thrust::sort(rmm::exec_policy(stream)->on(stream), pair_first, pair_first + number_of_edges); + return static_cast(thrust::distance( + pair_first, + thrust::unique( + rmm::exec_policy(stream)->on(stream), pair_first, pair_first + number_of_edges))); + } +} + +template +std::tuple, + rmm::device_uvector, + std::optional>> +decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( + raft::handle_t const& handle, + matrix_partition_device_view_t const matrix_partition, + vertex_t const* p_major_labels, + vertex_t const* p_minor_labels, + std::optional> const& segment_offsets) +{ + // FIXME: it might be possible to directly create relabled & coarsened edgelist from the + // compressed sparse format to save memory + + auto [edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights] = + decompress_matrix_partition_to_edgelist(handle, matrix_partition, segment_offsets); + + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + pair_first, + pair_first + 
edgelist_major_vertices.size(), + pair_first, + [p_major_labels, + p_minor_labels, + major_first = matrix_partition.get_major_first(), + minor_first = matrix_partition.get_minor_first()] __device__(auto val) { + return thrust::make_tuple(p_major_labels[thrust::get<0>(val) - major_first], + p_minor_labels[thrust::get<1>(val) - minor_first]); + }); + + auto number_of_edges = groupby_e_and_coarsen_edgelist( + edgelist_major_vertices.data(), + edgelist_minor_vertices.data(), + edgelist_weights ? std::optional{(*edgelist_weights).data()} : std::nullopt, + static_cast(edgelist_major_vertices.size()), + handle.get_stream()); + edgelist_major_vertices.resize(number_of_edges, handle.get_stream()); + edgelist_major_vertices.shrink_to_fit(handle.get_stream()); + edgelist_minor_vertices.resize(number_of_edges, handle.get_stream()); + edgelist_minor_vertices.shrink_to_fit(handle.get_stream()); + if (edgelist_weights) { + (*edgelist_weights).resize(number_of_edges, handle.get_stream()); + (*edgelist_weights).shrink_to_fit(handle.get_stream()); + } + + return std::make_tuple(std::move(edgelist_major_vertices), + std::move(edgelist_minor_vertices), + std::move(edgelist_weights)); +} + +// multi-GPU version +template +std::enable_if_t< + multi_gpu, + std::tuple>, + rmm::device_uvector>> +coarsen_graph( + raft::handle_t const& handle, + graph_view_t const& graph_view, + vertex_t const* labels, + bool do_expensive_check) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto const row_comm_rank = row_comm.get_rank(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + auto const col_comm_rank = col_comm.get_rank(); + + if (do_expensive_check) { + // currently, nothing to do + } + + // 1. construct coarsened edge list + + rmm::device_uvector adj_matrix_minor_labels( + store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_rows() + : graph_view.get_number_of_local_adj_matrix_partition_cols(), + handle.get_stream()); + if (store_transposed) { + copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_minor_labels.data()); + } else { + copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels.data()); + } + + std::vector> coarsened_edgelist_major_vertices{}; + std::vector> coarsened_edgelist_minor_vertices{}; + auto coarsened_edgelist_weights = + graph_view.is_weighted() ? std::make_optional>>({}) + : std::nullopt; + coarsened_edgelist_major_vertices.reserve(graph_view.get_number_of_local_adj_matrix_partitions()); + coarsened_edgelist_minor_vertices.reserve(coarsened_edgelist_major_vertices.size()); + if (coarsened_edgelist_weights) { + (*coarsened_edgelist_weights).reserve(coarsened_edgelist_major_vertices.size()); + } + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + coarsened_edgelist_major_vertices.emplace_back(0, handle.get_stream()); + coarsened_edgelist_minor_vertices.emplace_back(0, handle.get_stream()); + if (coarsened_edgelist_weights) { + (*coarsened_edgelist_weights).emplace_back(0, handle.get_stream()); + } + } + // FIXME: we may compare performance/memory footprint with the hash_based approach especially when + // cuco::dynamic_map becomes available (so we don't need to preallocate memory assuming the worst + // case). 
We may be able to limit the memory requirement close to the final coarsened edgelist + with the hash based approach. + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + // 1-1. locally construct coarsened edge list + + // barrier is necessary here to avoid potential overlap (which can lead to deadlock) between + // two different communicators (beginning of col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + rmm::device_uvector major_labels( + store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols(i) + : graph_view.get_number_of_local_adj_matrix_partition_rows(i), + handle.get_stream()); + if (col_comm_rank == static_cast(i)) { + // FIXME: this copy is unnecessary, better fix RAFT comm's bcast to take const iterators for + // input + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + labels, + labels + major_labels.size(), + major_labels.begin()); + } + device_bcast(col_comm, + major_labels.data(), + major_labels.data(), + major_labels.size(), + static_cast(i), + handle.get_stream()); + // barrier is necessary here to avoid potential overlap (which can lead to deadlock) between + // two different communicators (end of col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + + auto [edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights] = + decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( + handle, + matrix_partition_device_view_t( + graph_view.get_matrix_partition_view(i)), + major_labels.data(), + adj_matrix_minor_labels.data(), + graph_view.get_local_adj_matrix_partition_segment_offsets(i)); + + // 1-2. globally shuffle + + std::tie(edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights) = + cugraph::detail::shuffle_edgelist_by_edge( + handle, edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights, false); + + // 1-3. append data to local adjacency matrix partitions + + // FIXME: we can skip this if groupby_gpuid_and_shuffle_values is updated to return sorted edge + // list based on the final matrix partition (maybe add + // groupby_adj_matrix_partition_and_shuffle_values). + + auto counts = cugraph::detail::groupby_and_count_by_edge( + handle, + edgelist_major_vertices, + edgelist_minor_vertices, + edgelist_weights, + graph_view.get_number_of_local_adj_matrix_partitions()); + + std::vector h_counts(counts.size()); + raft::update_host(h_counts.data(), counts.data(), counts.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + + std::vector h_displacements(h_counts.size(), size_t{0}); + std::partial_sum(h_counts.begin(), h_counts.end() - 1, h_displacements.begin() + 1); + + for (int j = 0; j < col_comm_size; ++j) { + auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( + edgelist_major_vertices.begin() + h_displacements[j], + edgelist_minor_vertices.begin() + h_displacements[j], + edgelist_weights ?
std::optional{(*edgelist_weights).data() + h_displacements[j]} + : std::nullopt, + h_counts[j], + handle.get_stream()); + + auto cur_size = coarsened_edgelist_major_vertices[j].size(); + // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we + // can reserve address space to avoid expensive reallocation. + // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management + coarsened_edgelist_major_vertices[j].resize(cur_size + number_of_partition_edges, + handle.get_stream()); + coarsened_edgelist_minor_vertices[j].resize(coarsened_edgelist_major_vertices[j].size(), + handle.get_stream()); + if (coarsened_edgelist_weights) { + (*coarsened_edgelist_weights)[j].resize(coarsened_edgelist_major_vertices[j].size(), + handle.get_stream()); + + auto src_edge_first = + thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(), + edgelist_minor_vertices.begin(), + (*edgelist_weights).begin())) + + h_displacements[j]; + auto dst_edge_first = + thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(), + coarsened_edgelist_minor_vertices[j].begin(), + (*coarsened_edgelist_weights)[j].begin())) + + cur_size; + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + src_edge_first, + src_edge_first + number_of_partition_edges, + dst_edge_first); + } else { + auto src_edge_first = thrust::make_zip_iterator(thrust::make_tuple( + edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())) + + h_displacements[j]; + auto dst_edge_first = thrust::make_zip_iterator( + thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(), + coarsened_edgelist_minor_vertices[j].begin())) + + cur_size; + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + src_edge_first, + src_edge_first + number_of_partition_edges, + dst_edge_first); + } + } + } + + for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) { + auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( + coarsened_edgelist_major_vertices[i].data(), + coarsened_edgelist_minor_vertices[i].data(), + coarsened_edgelist_weights ? std::optional{(*coarsened_edgelist_weights)[i].data()} + : std::nullopt, + static_cast(coarsened_edgelist_major_vertices[i].size()), + handle.get_stream()); + coarsened_edgelist_major_vertices[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_major_vertices[i].shrink_to_fit(handle.get_stream()); + coarsened_edgelist_minor_vertices[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_minor_vertices[i].shrink_to_fit(handle.get_stream()); + if (coarsened_edgelist_weights) { + (*coarsened_edgelist_weights)[i].resize(number_of_partition_edges, handle.get_stream()); + (*coarsened_edgelist_weights)[i].shrink_to_fit(handle.get_stream()); + } + } + + // 3. 
find unique labels for this GPU + + rmm::device_uvector unique_labels(graph_view.get_number_of_local_vertices(), + handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + labels, + labels + unique_labels.size(), + unique_labels.begin()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + unique_labels.begin(), + unique_labels.end()); + unique_labels.resize( + thrust::distance(unique_labels.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + unique_labels.begin(), + unique_labels.end())), + handle.get_stream()); + + unique_labels = cugraph::detail::shuffle_vertices(handle, unique_labels); + + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + unique_labels.begin(), + unique_labels.end()); + unique_labels.resize( + thrust::distance(unique_labels.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + unique_labels.begin(), + unique_labels.end())), + handle.get_stream()); + + // 4. renumber + + rmm::device_uvector renumber_map_labels(0, handle.get_stream()); + partition_t partition(std::vector(comm_size + 1, 0), + row_comm_size, + col_comm_size, + row_comm_rank, + col_comm_rank); + vertex_t number_of_vertices{}; + edge_t number_of_edges{}; + std::optional> segment_offsets{}; + { + std::vector major_ptrs(coarsened_edgelist_major_vertices.size()); + std::vector minor_ptrs(major_ptrs.size()); + std::vector counts(major_ptrs.size()); + for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) { + major_ptrs[i] = coarsened_edgelist_major_vertices[i].data(); + minor_ptrs[i] = coarsened_edgelist_minor_vertices[i].data(); + counts[i] = static_cast(coarsened_edgelist_major_vertices[i].size()); + } + std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges, segment_offsets) = + renumber_edgelist( + handle, + std::optional>{ + std::make_tuple(unique_labels.data(), static_cast(unique_labels.size()))}, + major_ptrs, + minor_ptrs, + counts, + do_expensive_check); + } + + // 5. build a graph + + std::vector> edgelists{}; + edgelists.resize(graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; i < edgelists.size(); ++i) { + edgelists[i].p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices[i].data() + : coarsened_edgelist_major_vertices[i].data(); + edgelists[i].p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices[i].data() + : coarsened_edgelist_minor_vertices[i].data(); + edgelists[i].p_edge_weights = + coarsened_edgelist_weights + ? 
std::optional{(*coarsened_edgelist_weights)[i].data()} + : std::nullopt, + edgelists[i].number_of_edges = static_cast(coarsened_edgelist_major_vertices[i].size()); + } + + return std::make_tuple( + std::make_unique>( + handle, + edgelists, + partition, + number_of_vertices, + number_of_edges, + graph_properties_t{graph_view.is_symmetric(), false}, + segment_offsets), + std::move(renumber_map_labels)); +} + +// single-GPU version +template +std::enable_if_t< + !multi_gpu, + std::tuple>, + rmm::device_uvector>> +coarsen_graph( + raft::handle_t const& handle, + graph_view_t const& graph_view, + vertex_t const* labels, + bool do_expensive_check) +{ + if (do_expensive_check) { + // currently, nothing to do + } + + auto [coarsened_edgelist_major_vertices, + coarsened_edgelist_minor_vertices, + coarsened_edgelist_weights] = + decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( + handle, + matrix_partition_device_view_t( + graph_view.get_matrix_partition_view()), + labels, + labels, + graph_view.get_local_adj_matrix_partition_segment_offsets(0)); + + rmm::device_uvector unique_labels(graph_view.get_number_of_vertices(), + handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + labels, + labels + unique_labels.size(), + unique_labels.begin()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + unique_labels.begin(), + unique_labels.end()); + unique_labels.resize( + thrust::distance(unique_labels.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + unique_labels.begin(), + unique_labels.end())), + handle.get_stream()); + + auto [renumber_map_labels, segment_offsets] = renumber_edgelist( + handle, + std::optional>{ + std::make_tuple(unique_labels.data(), static_cast(unique_labels.size()))}, + coarsened_edgelist_major_vertices.data(), + coarsened_edgelist_minor_vertices.data(), + static_cast(coarsened_edgelist_major_vertices.size()), + do_expensive_check); + + edgelist_t edgelist{}; + edgelist.p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices.data() + : coarsened_edgelist_major_vertices.data(); + edgelist.p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices.data() + : coarsened_edgelist_minor_vertices.data(); + edgelist.p_edge_weights = coarsened_edgelist_weights + ? 
std::optional{(*coarsened_edgelist_weights).data()} + : std::nullopt; + edgelist.number_of_edges = static_cast(coarsened_edgelist_major_vertices.size()); + + return std::make_tuple( + std::make_unique>( + handle, + edgelist, + static_cast(renumber_map_labels.size()), + graph_properties_t{graph_view.is_symmetric(), false}, + segment_offsets), + std::move(renumber_map_labels)); +} + +} // namespace detail + +template +std::tuple>, + rmm::device_uvector> +coarsen_graph( + raft::handle_t const& handle, + graph_view_t const& graph_view, + vertex_t const* labels, + bool do_expensive_check) +{ + return detail::coarsen_graph(handle, graph_view, labels, do_expensive_check); +} + +// explicit instantiation + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int64_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int64_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int64_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int64_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t const* 
labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int32_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int64_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int64_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int64_t const* labels, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector> +coarsen_graph(raft::handle_t const& handle, + graph_view_t const& graph_view, + int64_t const* labels, + bool do_expensive_check); + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/experimental/graph.cu b/cpp/src/experimental/graph.cu index 3a2b7126d22..980356a96db 100644 --- a/cpp/src/experimental/graph.cu +++ b/cpp/src/experimental/graph.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,16 +14,16 @@ * limitations under the License. */ -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include -#include #include #include #include +#include #include #include @@ -59,62 +59,46 @@ struct out_of_range_t { } }; +// compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid template -std:: - tuple, rmm::device_uvector, rmm::device_uvector> - edge_list_to_compressed_sparse(raft::handle_t const &handle, - edgelist_t const &edgelist, - vertex_t major_first, - vertex_t major_last, - vertex_t minor_first, - vertex_t minor_last) +std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>> +compress_edgelist(edgelist_t const& edgelist, + vertex_t major_first, + std::optional major_hypersparse_first, + vertex_t major_last, + vertex_t minor_first, + vertex_t minor_last, + rmm::cuda_stream_view stream_view) { - rmm::device_uvector offsets((major_last - major_first) + 1, handle.get_stream()); - rmm::device_uvector indices(edgelist.number_of_edges, handle.get_stream()); - rmm::device_uvector weights( - edgelist.p_edge_weights != nullptr ? edgelist.number_of_edges : 0, handle.get_stream()); - thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - offsets.begin(), - offsets.end(), - edge_t{0}); - thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - indices.begin(), - indices.end(), - vertex_t{0}); - - // FIXME: need to performance test this code with R-mat graphs having highly-skewed degree - // distribution. 
If there is a small number of vertices with very large degrees, atomicAdd can - // sequentialize execution. CUDA9+ & Kepler+ provide complier/architectural optimizations to - // mitigate this impact - // (https://developer.nvidia.com/blog/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/), - // and we need to check this thrust::for_each based approach delivers the expected performance. - - // FIXME: also need to verify this approach is at least not significantly slower than the sorting - // based approach (this approach does not use extra memory, so better stick to this approach - // unless performance is significantly worse). + rmm::device_uvector offsets((major_last - major_first) + 1, stream_view); + rmm::device_uvector indices(edgelist.number_of_edges, stream_view); + auto weights = edgelist.p_edge_weights ? std::make_optional>( + edgelist.number_of_edges, stream_view) + : std::nullopt; + thrust::fill(rmm::exec_policy(stream_view), offsets.begin(), offsets.end(), edge_t{0}); + thrust::fill(rmm::exec_policy(stream_view), indices.begin(), indices.end(), vertex_t{0}); auto p_offsets = offsets.data(); - auto p_indices = indices.data(); - auto p_weights = - edgelist.p_edge_weights != nullptr ? weights.data() : static_cast(nullptr); - - thrust::for_each(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::for_each(rmm::exec_policy(stream_view), store_transposed ? edgelist.p_dst_vertices : edgelist.p_src_vertices, store_transposed ? edgelist.p_dst_vertices + edgelist.number_of_edges : edgelist.p_src_vertices + edgelist.number_of_edges, [p_offsets, major_first] __device__(auto v) { atomicAdd(p_offsets + (v - major_first), edge_t{1}); }); + thrust::exclusive_scan( + rmm::exec_policy(stream_view), offsets.begin(), offsets.end(), offsets.begin()); - thrust::exclusive_scan(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - offsets.begin(), - offsets.end(), - offsets.begin()); + auto p_indices = indices.data(); + if (edgelist.p_edge_weights) { + auto p_weights = (*weights).data(); - if (edgelist.p_edge_weights != nullptr) { auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( - edgelist.p_src_vertices, edgelist.p_dst_vertices, edgelist.p_edge_weights)); - thrust::for_each(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist.p_src_vertices, edgelist.p_dst_vertices, *(edgelist.p_edge_weights))); + thrust::for_each(rmm::exec_policy(stream_view), edge_first, edge_first + edgelist.number_of_edges, [p_offsets, p_indices, p_weights, major_first] __device__(auto e) { @@ -137,10 +121,10 @@ std:: } else { auto edge_first = thrust::make_zip_iterator( thrust::make_tuple(edgelist.p_src_vertices, edgelist.p_dst_vertices)); - thrust::for_each(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::for_each(rmm::exec_policy(stream_view), edge_first, edge_first + edgelist.number_of_edges, - [p_offsets, p_indices, p_weights, major_first] __device__(auto e) { + [p_offsets, p_indices, major_first] __device__(auto e) { auto s = thrust::get<0>(e); auto d = thrust::get<1>(e); auto major = store_transposed ? 
d : s; @@ -157,45 +141,51 @@ std:: }); } - // FIXME: need to add an option to sort neighbor lists - - return std::make_tuple(std::move(offsets), std::move(indices), std::move(weights)); -} - -template -std::vector segment_degree_sorted_vertex_partition(raft::handle_t const &handle, - DegreeIterator degree_first, - DegreeIterator degree_last, - ThresholdIterator threshold_first, - ThresholdIterator threshold_last) -{ - auto num_elements = thrust::distance(degree_first, degree_last); - auto num_segments = thrust::distance(threshold_first, threshold_last) + 1; - - std::vector h_segment_offsets(num_segments + 1); - h_segment_offsets[0] = 0; - h_segment_offsets.back() = num_elements; - - rmm::device_uvector d_segment_offsets(num_segments - 1, handle.get_stream()); - - thrust::upper_bound(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - degree_first, - degree_last, - threshold_first, - threshold_last, - d_segment_offsets.begin()); - - raft::update_host(h_segment_offsets.begin() + 1, - d_segment_offsets.begin(), - d_segment_offsets.size(), - handle.get_stream()); + auto dcs_nzd_vertices = major_hypersparse_first + ? std::make_optional>( + major_last - *major_hypersparse_first, stream_view) + : std::nullopt; + if (dcs_nzd_vertices) { + auto constexpr invalid_vertex = invalid_vertex_id::value; + + thrust::transform( + rmm::exec_policy(stream_view), + thrust::make_counting_iterator(*major_hypersparse_first), + thrust::make_counting_iterator(major_last), + (*dcs_nzd_vertices).begin(), + [major_first, offsets = offsets.data()] __device__(auto major) { + auto major_offset = major - major_first; + return offsets[major_offset + 1] - offsets[major_offset] > 0 ? major : invalid_vertex; + }); + + auto pair_first = thrust::make_zip_iterator(thrust::make_tuple( + (*dcs_nzd_vertices).begin(), offsets.begin() + (*major_hypersparse_first - major_first))); + (*dcs_nzd_vertices) + .resize(thrust::distance(pair_first, + thrust::remove_if(rmm::exec_policy(stream_view), + pair_first, + pair_first + (*dcs_nzd_vertices).size(), + [] __device__(auto pair) { + return thrust::get<0>(pair) == invalid_vertex; + })), + stream_view); + (*dcs_nzd_vertices).shrink_to_fit(stream_view); + if (static_cast((*dcs_nzd_vertices).size()) < major_last - *major_hypersparse_first) { + thrust::copy( + rmm::exec_policy(stream_view), + offsets.begin() + (major_last - major_first), + offsets.end(), + offsets.begin() + (*major_hypersparse_first - major_first) + (*dcs_nzd_vertices).size()); + offsets.resize((*major_hypersparse_first - major_first) + (*dcs_nzd_vertices).size() + 1, + stream_view); + offsets.shrink_to_fit(stream_view); + } + } - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // this is necessary as d_segment_offsets will become out-of-scope once - // this function returns and this function returns a host variable which - // can be used right after return. 
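// Illustrative sketch of the COO -> CSR compression strategy used by
// compress_edgelist above: count per-vertex degrees with atomicAdd,
// exclusive_scan the degrees into row offsets, then scatter each edge into its
// row. All names here (coo_to_csr, src, dst, num_vertices) are hypothetical,
// not part of the cuGraph API; assumes nvcc with --extended-lambda.
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/scan.h>
#include <thrust/tuple.h>

#include <utility>

std::pair<thrust::device_vector<int>, thrust::device_vector<int>> coo_to_csr(
  thrust::device_vector<int> const& src,  // COO edge sources (the "major" side)
  thrust::device_vector<int> const& dst,  // COO edge destinations
  int num_vertices)
{
  thrust::device_vector<int> offsets(num_vertices + 1, 0);
  thrust::device_vector<int> indices(dst.size());
  auto* d_offsets = thrust::raw_pointer_cast(offsets.data());

  // degree counting; atomicAdd may serialize on very high-degree vertices,
  // which is exactly the concern raised in the FIXME above
  thrust::for_each(thrust::device, src.begin(), src.end(),
                   [d_offsets] __device__(int s) { atomicAdd(d_offsets + s, 1); });

  // degrees -> row offsets
  thrust::exclusive_scan(thrust::device, offsets.begin(), offsets.end(), offsets.begin());

  // scatter each edge into its row, claiming output slots with atomicAdd
  thrust::device_vector<int> fill(offsets.begin(), offsets.end() - 1);
  auto* d_fill    = thrust::raw_pointer_cast(fill.data());
  auto* d_indices = thrust::raw_pointer_cast(indices.data());
  auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(src.begin(), dst.begin()));
  thrust::for_each(thrust::device, edge_first, edge_first + src.size(),
                   [d_fill, d_indices] __device__(thrust::tuple<int, int> e) {
                     d_indices[atomicAdd(d_fill + thrust::get<0>(e), 1)] = thrust::get<1>(e);
                   });
  return std::make_pair(std::move(offsets), std::move(indices));
}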
+ // FIXME: need to add an option to sort neighbor lists - return h_segment_offsets; + return std::make_tuple( + std::move(offsets), std::move(indices), std::move(weights), std::move(dcs_nzd_vertices)); } } // namespace @@ -206,13 +196,13 @@ template graph_t>:: - graph_t(raft::handle_t const &handle, - std::vector> const &edgelists, - partition_t const &partition, + graph_t(raft::handle_t const& handle, + std::vector> const& edgelists, + partition_t const& partition, vertex_t number_of_vertices, edge_t number_of_edges, graph_properties_t properties, - bool sorted_by_global_degree_within_vertex_partition, + std::optional> const& segment_offsets, bool do_expensive_check) : detail::graph_base_t( handle, number_of_vertices, number_of_edges, properties), @@ -220,52 +210,53 @@ graph_tget_handle_ptr()->get_comms(); + auto& comm = this->get_handle_ptr()->get_comms(); auto const comm_size = comm.get_size(); - auto &row_comm = + auto& row_comm = this->get_handle_ptr()->get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); auto const row_comm_rank = row_comm.get_rank(); auto const row_comm_size = row_comm.get_size(); - auto &col_comm = + auto& col_comm = this->get_handle_ptr()->get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - auto default_stream = this->get_handle_ptr()->get_stream(); + auto default_stream_view = this->get_handle_ptr()->get_stream_view(); - CUGRAPH_EXPECTS(edgelists.size() > 0, - "Invalid API parameter: edgelists.size() should be non-zero."); + CUGRAPH_EXPECTS(edgelists.size() == static_cast(col_comm_size), + "Invalid input argument: erroneous edgelists.size()."); + CUGRAPH_EXPECTS( + !segment_offsets.has_value() || + ((*segment_offsets).size() == (detail::num_sparse_segments_per_vertex_partition + 1)) || + ((*segment_offsets).size() == (detail::num_sparse_segments_per_vertex_partition + 2)), + "Invalid input argument: segment_offsets.size() returns an invalid value."); - bool is_weighted = edgelists[0].p_edge_weights != nullptr; + auto is_weighted = edgelists[0].p_edge_weights.has_value(); + auto use_dcs = + segment_offsets + ? 
((*segment_offsets).size() > (detail::num_sparse_segments_per_vertex_partition + 1)) + : false; CUGRAPH_EXPECTS( - std::any_of(edgelists.begin() + 1, + std::any_of(edgelists.begin(), edgelists.end(), [is_weighted](auto edgelist) { - return (edgelist.p_src_vertices == nullptr) || - (edgelist.p_dst_vertices == nullptr) || - (is_weighted && (edgelist.p_edge_weights == nullptr)) || - (!is_weighted && (edgelist.p_edge_weights != nullptr)); + return ((edgelist.number_of_edges > 0) && (edgelist.p_src_vertices == nullptr)) || + ((edgelist.number_of_edges > 0) && (edgelist.p_dst_vertices == nullptr)) || + (is_weighted && (edgelist.number_of_edges > 0) && + ((edgelist.p_edge_weights.has_value() == false) || + (*(edgelist.p_edge_weights) == nullptr))); }) == false, - "Invalid API parameter: edgelists[].p_src_vertices and edgelists[].p_dst_vertices should not " - "be nullptr and edgelists[].p_edge_weights should be nullptr (if edgelists[0].p_edge_weights " - "is nullptr) or should not be nullptr (otherwise)."); - - CUGRAPH_EXPECTS((partition.is_hypergraph_partitioned() && - (edgelists.size() == static_cast(col_comm_size))) || - (!(partition.is_hypergraph_partitioned()) && (edgelists.size() == 1)), - "Invalid API parameter: errneous edgelists.size()."); + "Invalid input argument: edgelists[].p_src_vertices and edgelists[].p_dst_vertices should not " + "be nullptr if edgelists[].number_of_edges > 0 and edgelists[].p_edge_weights should be " + "neither std::nullopt nor nullptr if weighted and edgelists[].number_of_edges > 0."); - // optional expensive checks (part 1/3) + // optional expensive checks (part 1/2) if (do_expensive_check) { edge_t number_of_local_edges_sum{}; for (size_t i = 0; i < edgelists.size(); ++i) { - vertex_t major_first{}; - vertex_t major_last{}; - vertex_t minor_first{}; - vertex_t minor_last{}; - std::tie(major_first, major_last) = partition.get_matrix_partition_major_range(i); - std::tie(minor_first, minor_last) = partition.get_matrix_partition_minor_range(); + auto [major_first, major_last] = partition.get_matrix_partition_major_range(i); + auto [minor_first, minor_last] = partition.get_matrix_partition_minor_range(); number_of_local_edges_sum += edgelists[i].number_of_edges; @@ -273,126 +264,95 @@ graph_ton(default_stream), + CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(default_stream_view), edge_first, edge_first + edgelists[i].number_of_edges, out_of_range_t{ major_first, major_last, minor_first, minor_last}) == 0, - "Invalid API parameter: edgelists[] have out-of-range values."); + "Invalid input argument: edgelists[] have out-of-range values."); } number_of_local_edges_sum = - host_scalar_allreduce(comm, number_of_local_edges_sum, default_stream); - CUGRAPH_EXPECTS(number_of_local_edges_sum == this->get_number_of_edges(), - "Invalid API parameter: the sum of local edges doe counts not match with " - "number_of_local_edges."); + host_scalar_allreduce(comm, number_of_local_edges_sum, default_stream_view.value()); + CUGRAPH_EXPECTS( + number_of_local_edges_sum == this->get_number_of_edges(), + "Invalid input argument: the sum of local edge counts does not match with number_of_edges."); CUGRAPH_EXPECTS( partition.get_vertex_partition_last(comm_size - 1) == number_of_vertices, - "Invalid API parameter: vertex partition should cover [0, number_of_vertices)."); + "Invalid input argument: vertex partition should cover [0, number_of_vertices)."); } - // convert edge list (COO) to compressed sparse format (CSR or CSC) + // aggregate segment_offsets + + if (segment_offsets) { + 
// FIXME: we need to add host_allgather + rmm::device_uvector d_segment_offsets((*segment_offsets).size(), default_stream_view); + raft::update_device(d_segment_offsets.data(), + (*segment_offsets).data(), + (*segment_offsets).size(), + default_stream_view.value()); + rmm::device_uvector d_aggregate_segment_offsets( + col_comm_size * d_segment_offsets.size(), default_stream_view); + col_comm.allgather(d_segment_offsets.data(), + d_aggregate_segment_offsets.data(), + d_segment_offsets.size(), + default_stream_view.value()); + + adj_matrix_partition_segment_offsets_ = + std::vector(d_aggregate_segment_offsets.size(), vertex_t{0}); + raft::update_host((*adj_matrix_partition_segment_offsets_).data(), + d_aggregate_segment_offsets.data(), + d_aggregate_segment_offsets.size(), + default_stream_view.value()); + + default_stream_view + .synchronize(); // this is necessary as adj_matrix_partition_segment_offsets_ can be used + // right after return. + } + + // compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid adj_matrix_partition_offsets_.reserve(edgelists.size()); adj_matrix_partition_indices_.reserve(edgelists.size()); - adj_matrix_partition_weights_.reserve(is_weighted ? edgelists.size() : 0); + if (is_weighted) { + adj_matrix_partition_weights_ = std::vector>{}; + (*adj_matrix_partition_weights_).reserve(edgelists.size()); + } + if (use_dcs) { + adj_matrix_partition_dcs_nzd_vertices_ = std::vector>{}; + adj_matrix_partition_dcs_nzd_vertex_counts_ = std::vector{}; + (*adj_matrix_partition_dcs_nzd_vertices_).reserve(edgelists.size()); + (*adj_matrix_partition_dcs_nzd_vertex_counts_).reserve(edgelists.size()); + } for (size_t i = 0; i < edgelists.size(); ++i) { - vertex_t major_first{}; - vertex_t major_last{}; - vertex_t minor_first{}; - vertex_t minor_last{}; - std::tie(major_first, major_last) = partition.get_matrix_partition_major_range(i); - std::tie(minor_first, minor_last) = partition.get_matrix_partition_minor_range(); - - rmm::device_uvector offsets(0, default_stream); - rmm::device_uvector indices(0, default_stream); - rmm::device_uvector weights(0, default_stream); - std::tie(offsets, indices, weights) = edge_list_to_compressed_sparse( - *(this->get_handle_ptr()), edgelists[i], major_first, major_last, minor_first, minor_last); + auto [major_first, major_last] = partition.get_matrix_partition_major_range(i); + auto [minor_first, minor_last] = partition.get_matrix_partition_minor_range(); + auto major_hypersparse_first = + use_dcs ? 
std::optional{major_first + + (*adj_matrix_partition_segment_offsets_) + [(*segment_offsets).size() * i + + detail::num_sparse_segments_per_vertex_partition]} + : std::nullopt; + auto [offsets, indices, weights, dcs_nzd_vertices] = + compress_edgelist(edgelists[i], + major_first, + major_hypersparse_first, + major_last, + minor_first, + minor_last, + default_stream_view); + adj_matrix_partition_offsets_.push_back(std::move(offsets)); adj_matrix_partition_indices_.push_back(std::move(indices)); - if (is_weighted) { adj_matrix_partition_weights_.push_back(std::move(weights)); } - } - - // update degree-based segment offsets (to be used for graph analytics kernel optimization) - - if (sorted_by_global_degree_within_vertex_partition) { - auto degrees = detail::compute_major_degree( - *(this->get_handle_ptr()), adj_matrix_partition_offsets_, partition_); - - // optional expensive checks (part 2/3) - - if (do_expensive_check) { - CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream)->on(default_stream), - degrees.begin(), - degrees.end(), - thrust::greater{}), - "Invalid API parameter: sorted_by_global_degree_within_vertex_partition is " - "set to true, but degrees are not non-ascending."); + if (is_weighted) { (*adj_matrix_partition_weights_).push_back(std::move(*weights)); } + if (use_dcs) { + auto dcs_nzd_vertex_count = static_cast((*dcs_nzd_vertices).size()); + (*adj_matrix_partition_dcs_nzd_vertices_).push_back(std::move(*dcs_nzd_vertices)); + (*adj_matrix_partition_dcs_nzd_vertex_counts_).push_back(dcs_nzd_vertex_count); } - - static_assert(detail::num_segments_per_vertex_partition == 3); - static_assert((detail::low_degree_threshold <= detail::mid_degree_threshold) && - (detail::mid_degree_threshold <= std::numeric_limits::max())); - rmm::device_uvector d_thresholds(detail::num_segments_per_vertex_partition - 1, - default_stream); - std::vector h_thresholds = {static_cast(detail::low_degree_threshold), - static_cast(detail::mid_degree_threshold)}; - raft::update_device( - d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), default_stream); - - rmm::device_uvector segment_offsets(detail::num_segments_per_vertex_partition + 1, - default_stream); - segment_offsets.set_element_async(0, 0, default_stream); - segment_offsets.set_element_async( - detail::num_segments_per_vertex_partition, degrees.size(), default_stream); - - thrust::upper_bound(rmm::exec_policy(default_stream)->on(default_stream), - degrees.begin(), - degrees.end(), - d_thresholds.begin(), - d_thresholds.end(), - segment_offsets.begin() + 1); - - rmm::device_uvector aggregate_segment_offsets(0, default_stream); - if (partition.is_hypergraph_partitioned()) { - rmm::device_uvector aggregate_segment_offsets( - col_comm_size * segment_offsets.size(), default_stream); - col_comm.allgather(segment_offsets.data(), - aggregate_segment_offsets.data(), - segment_offsets.size(), - default_stream); - } else { - rmm::device_uvector aggregate_segment_offsets( - row_comm_size * segment_offsets.size(), default_stream); - row_comm.allgather(segment_offsets.data(), - aggregate_segment_offsets.data(), - segment_offsets.size(), - default_stream); - } - - vertex_partition_segment_offsets_.resize(aggregate_segment_offsets.size()); - raft::update_host(vertex_partition_segment_offsets_.data(), - aggregate_segment_offsets.data(), - aggregate_segment_offsets.size(), - default_stream); - - raft::comms::status_t status{}; - if (partition.is_hypergraph_partitioned()) { - status = col_comm.sync_stream( - default_stream); // this is 
necessary as degrees, d_thresholds, and segment_offsets will - // become out-of-scope once control flow exits this block and - // vertex_partition_segment_offsets_ can be used right after return. - } else { - status = row_comm.sync_stream( - default_stream); // this is necessary as degrees, d_thresholds, and segment_offsets will - // become out-of-scope once control flow exits this block and - // vertex_partition_segment_offsets_ can be used right after return. - } - CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); } - // optional expensive checks (part 3/3) + // optional expensive checks (part 2/2) if (do_expensive_check) { // FIXME: check for symmetricity may better be implemetned with transpose(). @@ -409,26 +369,37 @@ template graph_t>:: - graph_t(raft::handle_t const &handle, - edgelist_t const &edgelist, + graph_t(raft::handle_t const& handle, + edgelist_t const& edgelist, vertex_t number_of_vertices, graph_properties_t properties, - bool sorted_by_degree, + std::optional> const& segment_offsets, bool do_expensive_check) : detail::graph_base_t( handle, number_of_vertices, edgelist.number_of_edges, properties), - offsets_(rmm::device_uvector(0, handle.get_stream())), - indices_(rmm::device_uvector(0, handle.get_stream())), - weights_(rmm::device_uvector(0, handle.get_stream())) + offsets_(rmm::device_uvector(0, handle.get_stream_view())), + indices_(rmm::device_uvector(0, handle.get_stream_view())), + segment_offsets_(segment_offsets) { // cheap error checks - auto default_stream = this->get_handle_ptr()->get_stream(); + auto default_stream_view = this->get_handle_ptr()->get_stream_view(); + + auto is_weighted = edgelist.p_edge_weights.has_value(); CUGRAPH_EXPECTS( - (edgelist.p_src_vertices != nullptr) && (edgelist.p_dst_vertices != nullptr), - "Invalid API parameter: edgelist.p_src_vertices and edgelist.p_dst_vertices should " - "not be nullptr."); + ((edgelist.number_of_edges == 0) || (edgelist.p_src_vertices != nullptr)) && + ((edgelist.number_of_edges == 0) || (edgelist.p_dst_vertices != nullptr)) && + (!is_weighted || (is_weighted && ((edgelist.number_of_edges == 0) || + (*(edgelist.p_edge_weights) != nullptr)))), + "Invalid input argument: edgelist.p_src_vertices and edgelist.p_dst_vertices should not be " + "nullptr if edgelist.number_of_edges > 0 and edgelist.p_edge_weights should be neither " + "std::nullopt nor nullptr if weighted and edgelist.number_of_edges > 0."); + + CUGRAPH_EXPECTS( + !segment_offsets.has_value() || + ((*segment_offsets).size() == (detail::num_sparse_segments_per_vertex_partition + 1)), + "Invalid input argument: segment_offsets.size() returns an invalid value."); // optional expensive checks (part 1/2) @@ -438,12 +409,12 @@ graph_ton(default_stream), + rmm::exec_policy(default_stream_view), edge_first, edge_first + edgelist.number_of_edges, out_of_range_t{ 0, this->get_number_of_vertices(), 0, this->get_number_of_vertices()}) == 0, - "Invalid API parameter: edgelist have out-of-range values."); + "Invalid input argument: edgelist has out-of-range values."); // FIXME: check for symmetricity may better be implemetned with transpose(). 
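// Illustrative sketch of the hypersparse (DCSR) compaction performed for
// dcs_nzd_vertices above: within the hypersparse tail [first, last) of a
// partition, keep only the majors whose CSR row is non-empty, so zero-degree
// rows cost no offset storage. Names (compact_hypersparse, first, last) are
// hypothetical, not part of the cuGraph API.
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/distance.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>

thrust::device_vector<int> compact_hypersparse(
  thrust::device_vector<int> const& offsets,  // CSR offsets, size == (last - first) + 1
  int first,                                  // hypersparse region is [first, last)
  int last)
{
  thrust::device_vector<int> nzd_vertices(last - first);
  auto const* d_offsets = thrust::raw_pointer_cast(offsets.data());
  // keep a major vertex only if its row holds at least one edge
  auto nzd_last = thrust::copy_if(
    thrust::device,
    thrust::make_counting_iterator(first),
    thrust::make_counting_iterator(last),
    nzd_vertices.begin(),
    [d_offsets, first] __device__(int major) {
      auto i = major - first;
      return d_offsets[i + 1] - d_offsets[i] > 0;
    });
  nzd_vertices.resize(thrust::distance(nzd_vertices.begin(), nzd_last));
  // the surviving rows' offsets would be compacted with the same predicate,
  // analogous to the remove_if over (vertex, offset) zip pairs above
  return nzd_vertices;
}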
if (this->is_symmetric()) {} @@ -454,64 +425,14 @@ graph_t(*(this->get_handle_ptr()), - edgelist, - vertex_t{0}, - this->get_number_of_vertices(), - vertex_t{0}, - this->get_number_of_vertices()); - - // update degree-based segment offsets (to be used for graph analytics kernel optimization) - - if (sorted_by_degree) { - auto degree_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(vertex_t{0}), - detail::degree_from_offsets_t{offsets_.data()}); - - // optional expensive checks (part 2/2) - - if (do_expensive_check) { - CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream)->on(default_stream), - degree_first, - degree_first + this->get_number_of_vertices(), - thrust::greater{}), - "Invalid API parameter: sorted_by_degree is set to true, but degrees are not " - "non-ascending."); - } - - static_assert(detail::num_segments_per_vertex_partition == 3); - static_assert((detail::low_degree_threshold <= detail::mid_degree_threshold) && - (detail::mid_degree_threshold <= std::numeric_limits::max())); - rmm::device_uvector d_thresholds(detail::num_segments_per_vertex_partition - 1, - default_stream); - std::vector h_thresholds = {static_cast(detail::low_degree_threshold), - static_cast(detail::mid_degree_threshold)}; - raft::update_device( - d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), default_stream); - - rmm::device_uvector segment_offsets(detail::num_segments_per_vertex_partition + 1, - default_stream); - segment_offsets.set_element_async(0, 0, default_stream); - segment_offsets.set_element_async( - detail::num_segments_per_vertex_partition, this->get_number_of_vertices(), default_stream); - - thrust::upper_bound(rmm::exec_policy(default_stream)->on(default_stream), - degree_first, - degree_first + this->get_number_of_vertices(), - d_thresholds.begin(), - d_thresholds.end(), - segment_offsets.begin() + 1); - - segment_offsets_.resize(segment_offsets.size()); - raft::update_host( - segment_offsets_.data(), segment_offsets.data(), segment_offsets.size(), default_stream); - - CUDA_TRY(cudaStreamSynchronize( - default_stream)); // this is necessary as d_thresholds and segment_offsets will become - // out-of-scpe once control flow exits this block and segment_offsets_ can - // be used right after return. - } + std::tie(offsets_, indices_, weights_, std::ignore) = + compress_edgelist(edgelist, + vertex_t{0}, + std::optional{std::nullopt}, + this->get_number_of_vertices(), + vertex_t{0}, + this->get_number_of_vertices(), + default_stream_view); // optional expensive checks (part 3/3) @@ -555,4 +476,4 @@ template class graph_t; } // namespace experimental } // namespace cugraph -#include +#include diff --git a/cpp/src/experimental/graph_view.cu b/cpp/src/experimental/graph_view.cu index 04d2ea990df..70dd6a326ac 100644 --- a/cpp/src/experimental/graph_view.cu +++ b/cpp/src/experimental/graph_view.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,15 +14,17 @@ * limitations under the License. 
*/ -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include #include -#include #include +#include +#include #include #include @@ -49,6 +51,110 @@ struct out_of_range_t { __device__ bool operator()(vertex_t v) { return (v < min) || (v >= max); } }; +template +std::vector update_adj_matrix_partition_edge_counts( + std::vector const& adj_matrix_partition_offsets, + std::optional> const& adj_matrix_partition_dcs_nzd_vertex_counts, + partition_t const& partition, + std::optional> const& adj_matrix_partition_segment_offsets, + cudaStream_t stream) +{ + std::vector adj_matrix_partition_edge_counts(partition.get_number_of_matrix_partitions(), + 0); + auto use_dcs = adj_matrix_partition_dcs_nzd_vertex_counts.has_value(); + for (size_t i = 0; i < adj_matrix_partition_offsets.size(); ++i) { + auto [major_first, major_last] = partition.get_matrix_partition_major_range(i); + raft::update_host(&(adj_matrix_partition_edge_counts[i]), + adj_matrix_partition_offsets[i] + + (use_dcs ? ((*adj_matrix_partition_segment_offsets) + [(detail::num_sparse_segments_per_vertex_partition + 2) * i + + detail::num_sparse_segments_per_vertex_partition] + + (*adj_matrix_partition_dcs_nzd_vertex_counts)[i]) + : (major_last - major_first)), + 1, + stream); + } + CUDA_TRY(cudaStreamSynchronize(stream)); + return adj_matrix_partition_edge_counts; +} + +template +rmm::device_uvector compute_minor_degrees( + raft::handle_t const& handle, + graph_view_t const& graph_view) +{ + rmm::device_uvector minor_degrees(graph_view.get_number_of_local_vertices(), + handle.get_stream()); + if (store_transposed) { + copy_v_transform_reduce_out_nbr( + handle, + graph_view, + thrust::make_constant_iterator(0) /* dummy */, + thrust::make_constant_iterator(0) /* dummy */, + [] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + return edge_t{1}; + }, + edge_t{0}, + minor_degrees.data()); + } else { + copy_v_transform_reduce_in_nbr( + handle, + graph_view, + thrust::make_constant_iterator(0) /* dummy */, + thrust::make_constant_iterator(0) /* dummy */, + [] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + return edge_t{1}; + }, + edge_t{0}, + minor_degrees.data()); + } + + return minor_degrees; +} + +template +rmm::device_uvector compute_weight_sums( + raft::handle_t const& handle, + graph_view_t const& graph_view) +{ + rmm::device_uvector weight_sums(graph_view.get_number_of_local_vertices(), + handle.get_stream()); + if (major == store_transposed) { + copy_v_transform_reduce_in_nbr( + handle, + graph_view, + thrust::make_constant_iterator(0) /* dummy */, + thrust::make_constant_iterator(0) /* dummy */, + [] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + return w; + }, + weight_t{0.0}, + weight_sums.data()); + } else { + copy_v_transform_reduce_out_nbr( + handle, + graph_view, + thrust::make_constant_iterator(0) /* dummy */, + thrust::make_constant_iterator(0) /* dummy */, + [] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + return w; + }, + weight_t{0.0}, + weight_sums.data()); + } + + return weight_sums; +} + } // namespace template graph_view_t>:: - graph_view_t(raft::handle_t const& handle, - std::vector const& adj_matrix_partition_offsets, - std::vector const& adj_matrix_partition_indices, - std::vector const& adj_matrix_partition_weights, - std::vector const& vertex_partition_segment_offsets, - partition_t const& partition, - vertex_t number_of_vertices, 
- edge_t number_of_edges, - graph_properties_t properties, - bool sorted_by_global_degree_within_vertex_partition, - bool do_expensive_check) + graph_view_t( + raft::handle_t const& handle, + std::vector const& adj_matrix_partition_offsets, + std::vector const& adj_matrix_partition_indices, + std::optional> const& adj_matrix_partition_weights, + std::optional> const& adj_matrix_partition_dcs_nzd_vertices, + std::optional> const& adj_matrix_partition_dcs_nzd_vertex_counts, + partition_t const& partition, + vertex_t number_of_vertices, + edge_t number_of_edges, + graph_properties_t properties, + std::optional> const& adj_matrix_partition_segment_offsets, + bool do_expensive_check) : detail::graph_base_t( handle, number_of_vertices, number_of_edges, properties), adj_matrix_partition_offsets_(adj_matrix_partition_offsets), adj_matrix_partition_indices_(adj_matrix_partition_indices), adj_matrix_partition_weights_(adj_matrix_partition_weights), + adj_matrix_partition_dcs_nzd_vertices_(adj_matrix_partition_dcs_nzd_vertices), + adj_matrix_partition_dcs_nzd_vertex_counts_(adj_matrix_partition_dcs_nzd_vertex_counts), + adj_matrix_partition_number_of_edges_( + update_adj_matrix_partition_edge_counts(adj_matrix_partition_offsets, + adj_matrix_partition_dcs_nzd_vertex_counts, + partition, + adj_matrix_partition_segment_offsets, + handle.get_stream())), partition_(partition), - vertex_partition_segment_offsets_(vertex_partition_segment_offsets) + adj_matrix_partition_segment_offsets_(adj_matrix_partition_segment_offsets) { // cheap error checks @@ -86,33 +202,41 @@ graph_view_tget_subcomm(cugraph::partition_2d::key_naming_t().col_name()) .get_size(); + auto is_weighted = adj_matrix_partition_weights.has_value(); + auto use_dcs = adj_matrix_partition_dcs_nzd_vertices.has_value(); + CUGRAPH_EXPECTS(adj_matrix_partition_offsets.size() == adj_matrix_partition_indices.size(), - "Invalid API parameter: adj_matrix_partition_offsets.size() and " + "Internal Error: adj_matrix_partition_offsets.size() and " "adj_matrix_partition_indices.size() should coincide."); - CUGRAPH_EXPECTS((adj_matrix_partition_weights.size() == adj_matrix_partition_offsets.size()) || - (adj_matrix_partition_weights.size() == 0), - "Invalid API parameter: adj_matrix_partition_weights.size() should coincide with " - "adj_matrix_partition_offsets.size() (if weighted) or 0 (if unweighted)."); + CUGRAPH_EXPECTS( + !is_weighted || ((*adj_matrix_partition_weights).size() == adj_matrix_partition_offsets.size()), + "Internal Error: adj_matrix_partition_weights.size() should coincide with " + "adj_matrix_partition_offsets.size() (if weighted)."); + CUGRAPH_EXPECTS(adj_matrix_partition_dcs_nzd_vertex_counts.has_value() == use_dcs, + "adj_matrix_partition_dcs_nzd_vertices.has_value() and " + "adj_matrix_partition_dcs_nzd_vertex_counts.has_value() should coincide"); + CUGRAPH_EXPECTS(!use_dcs || ((*adj_matrix_partition_dcs_nzd_vertices).size() == + (*adj_matrix_partition_dcs_nzd_vertex_counts).size()), + "Internal Error: adj_matrix_partition_dcs_nzd_vertices.size() and " + "adj_matrix_partition_dcs_nzd_vertex_counts.size() should coincide (if used)."); + CUGRAPH_EXPECTS(!use_dcs || ((*adj_matrix_partition_dcs_nzd_vertices).size() == + adj_matrix_partition_offsets.size()), + "Internal Error: adj_matrix_partition_dcs_nzd_vertices.size() should coincide " + "with adj_matrix_partition_offsets.size() (if used)."); + + CUGRAPH_EXPECTS(adj_matrix_partition_offsets.size() == static_cast(col_comm_size), + "Internal Error: erroneous 
adj_matrix_partition_offsets.size()."); CUGRAPH_EXPECTS( - (partition.is_hypergraph_partitioned() && - (adj_matrix_partition_offsets.size() == static_cast(row_comm_size))) || - (!(partition.is_hypergraph_partitioned()) && (adj_matrix_partition_offsets.size() == 1)), - "Invalid API parameter: errneous adj_matrix_partition_offsets.size()."); - - CUGRAPH_EXPECTS((sorted_by_global_degree_within_vertex_partition && - (vertex_partition_segment_offsets.size() == - (partition.is_hypergraph_partitioned() ? col_comm_size : row_comm_size) * - (detail::num_segments_per_vertex_partition + 1))) || - (!sorted_by_global_degree_within_vertex_partition && - (vertex_partition_segment_offsets.size() == 0)), - "Invalid API parameter: vertex_partition_segment_offsets.size() does not match " - "with sorted_by_global_degree_within_vertex_partition."); + !adj_matrix_partition_segment_offsets.has_value() || + ((*adj_matrix_partition_segment_offsets).size() == + col_comm_size * (detail::num_sparse_segments_per_vertex_partition + (use_dcs ? 2 : 1))), + "Internal Error: invalid adj_matrix_partition_segment_offsets.size()."); // optional expensive checks if (do_expensive_check) { - auto default_stream = this->get_handle_ptr()->get_stream(); + auto default_stream_view = this->get_handle_ptr()->get_stream_view(); auto const row_comm_rank = this->get_handle_ptr() ->get_subcomm(cugraph::partition_2d::key_naming_t().row_name()) @@ -123,73 +247,79 @@ graph_view_ton(default_stream), - adj_matrix_partition_offsets[i], - adj_matrix_partition_offsets[i] + (major_last - major_first + 1)), - "Invalid API parameter: adj_matrix_partition_offsets[] is not sorted."); + auto [major_first, major_last] = partition.get_matrix_partition_major_range(i); + auto [minor_first, minor_last] = partition.get_matrix_partition_minor_range(); + auto offset_array_size = major_last - major_first + 1; + if (use_dcs) { + auto major_hypersparse_first = + major_first + (*adj_matrix_partition_segment_offsets) + [(detail::num_sparse_segments_per_vertex_partition + 2) * i + + detail::num_sparse_segments_per_vertex_partition]; + offset_array_size = major_hypersparse_first - major_first + + (*adj_matrix_partition_dcs_nzd_vertex_counts)[i] + 1; + } + CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream_view), + adj_matrix_partition_offsets[i], + adj_matrix_partition_offsets[i] + offset_array_size), + "Internal Error: adj_matrix_partition_offsets[] is not sorted."); edge_t number_of_local_edges{}; raft::update_host(&number_of_local_edges, - adj_matrix_partition_offsets[i] + (major_last - major_first), + adj_matrix_partition_offsets[i] + offset_array_size - 1, 1, - default_stream); - CUDA_TRY(cudaStreamSynchronize(default_stream)); + default_stream_view.value()); + default_stream_view.synchronize(); number_of_local_edges_sum += number_of_local_edges; // better use thrust::any_of once https://github.com/thrust/thrust/issues/1016 is resolved CUGRAPH_EXPECTS( - thrust::count_if(rmm::exec_policy(default_stream)->on(default_stream), + thrust::count_if(rmm::exec_policy(default_stream_view), adj_matrix_partition_indices[i], adj_matrix_partition_indices[i] + number_of_local_edges, out_of_range_t{minor_first, minor_last}) == 0, - "Invalid API parameter: adj_matrix_partition_indices[] have out-of-range vertex IDs."); + "Internal Error: adj_matrix_partition_indices[] have out-of-range vertex IDs."); } number_of_local_edges_sum = host_scalar_allreduce( - this->get_handle_ptr()->get_comms(), number_of_local_edges_sum, default_stream); + 
this->get_handle_ptr()->get_comms(), number_of_local_edges_sum, default_stream_view.value()); CUGRAPH_EXPECTS(number_of_local_edges_sum == this->get_number_of_edges(), - "Invalid API parameter: the sum of local edges doe counts not match with " + "Internal Error: the sum of local edge counts does not match with " "number_of_local_edges."); - if (sorted_by_global_degree_within_vertex_partition) { - auto degrees = detail::compute_major_degree(handle, adj_matrix_partition_offsets, partition); - CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream)->on(default_stream), + if (adj_matrix_partition_segment_offsets) { + auto degrees = detail::compute_major_degrees(handle, + adj_matrix_partition_offsets, + adj_matrix_partition_dcs_nzd_vertices, + adj_matrix_partition_dcs_nzd_vertex_counts, + partition, + adj_matrix_partition_segment_offsets); + CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream_view), degrees.begin(), degrees.end(), thrust::greater{}), - "Invalid API parameter: sorted_by_global_degree_within_vertex_partition is " - "set to true, but degrees are not non-ascending."); - - for (int i = 0; i < (partition.is_hypergraph_partitioned() ? col_comm_size : row_comm_size); - ++i) { - CUGRAPH_EXPECTS(std::is_sorted(vertex_partition_segment_offsets.begin() + - (detail::num_segments_per_vertex_partition + 1) * i, - vertex_partition_segment_offsets.begin() + - (detail::num_segments_per_vertex_partition + 1) * (i + 1)), - "Invalid API parameter: erroneous vertex_partition_segment_offsets."); + "Invalid input argument: adj_matrix_partition_segment_offsets are " + "provided, but degrees are not in descending order."); + + auto num_segments_per_vertex_partition = + detail::num_sparse_segments_per_vertex_partition + (use_dcs ? 1 : 0); + for (int i = 0; i < col_comm_size; ++i) { + CUGRAPH_EXPECTS(std::is_sorted((*adj_matrix_partition_segment_offsets).begin() + + (num_segments_per_vertex_partition + 1) * i, + (*adj_matrix_partition_segment_offsets).begin() + + (num_segments_per_vertex_partition + 1) * (i + 1)), + "Internal Error: erroneous adj_matrix_partition_segment_offsets."); CUGRAPH_EXPECTS( - vertex_partition_segment_offsets[(detail::num_segments_per_vertex_partition + 1) * i] == - 0, - "Invalid API parameter: erroneous vertex_partition_segment_offsets."); - auto vertex_partition_idx = partition.is_hypergraph_partitioned() - ? 
row_comm_size * i + row_comm_rank - : col_comm_rank * row_comm_size + i; + (*adj_matrix_partition_segment_offsets)[(num_segments_per_vertex_partition + 1) * i] == 0, + "Internal Error: erroneous adj_matrix_partition_segment_offsets."); + auto vertex_partition_idx = row_comm_size * i + row_comm_rank; CUGRAPH_EXPECTS( - vertex_partition_segment_offsets[(detail::num_segments_per_vertex_partition + 1) * i + - detail::num_segments_per_vertex_partition] == + (*adj_matrix_partition_segment_offsets)[(num_segments_per_vertex_partition + 1) * i + + num_segments_per_vertex_partition] == partition.get_vertex_partition_size(vertex_partition_idx), - "Invalid API parameter: erroneous vertex_partition_segment_offsets."); + "Internal Error: erroneous adj_matrix_partition_segment_offsets."); } } - CUGRAPH_EXPECTS( - partition.get_vertex_partition_last(comm_size - 1) == number_of_vertices, - "Invalid API parameter: vertex partition should cover [0, number_of_vertices)."); + CUGRAPH_EXPECTS(partition.get_vertex_partition_last(comm_size - 1) == number_of_vertices, + "Internal Error: vertex partition should cover [0, number_of_vertices)."); // FIXME: check for symmetricity may better be implemetned with transpose(). if (this->is_symmetric()) {} @@ -212,13 +342,12 @@ graph_view_t>::graph_view_t(raft::handle_t const& handle, edge_t const* offsets, vertex_t const* indices, - weight_t const* weights, - std::vector const& - segment_offsets, + std::optional weights, vertex_t number_of_vertices, edge_t number_of_edges, graph_properties_t properties, - bool sorted_by_degree, + std::optional> const& + segment_offsets, bool do_expensive_check) : detail::graph_base_t( handle, number_of_vertices, number_of_edges, properties), @@ -230,45 +359,42 @@ graph_view_tget_handle_ptr()->get_stream(); + auto default_stream_view = this->get_handle_ptr()->get_stream_view(); - CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream)->on(default_stream), + CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream_view), offsets, offsets + (this->get_number_of_vertices() + 1)), - "Invalid API parameter: offsets is not sorted."); + "Internal Error: offsets is not sorted."); // better use thrust::any_of once https://github.com/thrust/thrust/issues/1016 is resolved CUGRAPH_EXPECTS( - thrust::count_if(rmm::exec_policy(default_stream)->on(default_stream), + thrust::count_if(rmm::exec_policy(default_stream_view), indices, indices + this->get_number_of_edges(), out_of_range_t{0, this->get_number_of_vertices()}) == 0, - "Invalid API parameter: adj_matrix_partition_indices[] have out-of-range vertex IDs."); - - if (sorted_by_degree) { - auto degree_first = - thrust::make_transform_iterator(thrust::make_counting_iterator(vertex_t{0}), - detail::degree_from_offsets_t{offsets}); - CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream)->on(default_stream), - degree_first, - degree_first + this->get_number_of_vertices(), + "Internal Error: adj_matrix_partition_indices[] have out-of-range vertex IDs."); + + if (segment_offsets) { + auto degrees = detail::compute_major_degrees(handle, offsets, number_of_vertices); + CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream_view), + degrees.begin(), + degrees.end(), thrust::greater{}), - "Invalid API parameter: sorted_by_degree is set to true, but degrees are not " - "non-ascending."); - - CUGRAPH_EXPECTS(std::is_sorted(segment_offsets.begin(), segment_offsets.end()), - "Invalid API parameter: erroneous segment_offsets."); - CUGRAPH_EXPECTS(segment_offsets[0] == 0, 
"Invalid API parameter: segment_offsets."); - CUGRAPH_EXPECTS(segment_offsets.back() == this->get_number_of_vertices(), - "Invalid API parameter: segment_offsets."); + "Invalid Invalid input argument: segment_offsets are provided, but degrees " + "are not in descending order."); + + CUGRAPH_EXPECTS(std::is_sorted((*segment_offsets).begin(), (*segment_offsets).end()), + "Internal Error: erroneous segment_offsets."); + CUGRAPH_EXPECTS((*segment_offsets)[0] == 0, "Invalid input argument segment_offsets."); + CUGRAPH_EXPECTS((*segment_offsets).back() == this->get_number_of_vertices(), + "Invalid input argument: segment_offsets."); } // FIXME: check for symmetricity may better be implemetned with transpose(). @@ -279,6 +405,338 @@ graph_view_t +rmm::device_uvector +graph_view_t>:: + compute_in_degrees(raft::handle_t const& handle) const +{ + if (store_transposed) { + return detail::compute_major_degrees(handle, + this->adj_matrix_partition_offsets_, + this->adj_matrix_partition_dcs_nzd_vertices_, + this->adj_matrix_partition_dcs_nzd_vertex_counts_, + this->partition_, + this->adj_matrix_partition_segment_offsets_); + } else { + return compute_minor_degrees(handle, *this); + } +} + +template +rmm::device_uvector +graph_view_t>::compute_in_degrees(raft::handle_t const& handle) const +{ + if (store_transposed) { + return detail::compute_major_degrees( + handle, this->offsets_, this->get_number_of_local_vertices()); + } else { + return compute_minor_degrees(handle, *this); + } +} + +template +rmm::device_uvector +graph_view_t>:: + compute_out_degrees(raft::handle_t const& handle) const +{ + if (store_transposed) { + return compute_minor_degrees(handle, *this); + } else { + return detail::compute_major_degrees(handle, + this->adj_matrix_partition_offsets_, + this->adj_matrix_partition_dcs_nzd_vertices_, + this->adj_matrix_partition_dcs_nzd_vertex_counts_, + this->partition_, + this->adj_matrix_partition_segment_offsets_); + } +} + +template +rmm::device_uvector +graph_view_t>::compute_out_degrees(raft::handle_t const& handle) const +{ + if (store_transposed) { + return compute_minor_degrees(handle, *this); + } else { + return detail::compute_major_degrees( + handle, this->offsets_, this->get_number_of_local_vertices()); + } +} + +template +rmm::device_uvector +graph_view_t>:: + compute_in_weight_sums(raft::handle_t const& handle) const +{ + if (store_transposed) { + return compute_weight_sums(handle, *this); + } else { + return compute_weight_sums(handle, *this); + } +} + +template +rmm::device_uvector graph_view_t< + vertex_t, + edge_t, + weight_t, + store_transposed, + multi_gpu, + std::enable_if_t>::compute_in_weight_sums(raft::handle_t const& handle) const +{ + if (store_transposed) { + return compute_weight_sums(handle, *this); + } else { + return compute_weight_sums(handle, *this); + } +} + +template +rmm::device_uvector +graph_view_t>:: + compute_out_weight_sums(raft::handle_t const& handle) const +{ + if (store_transposed) { + return compute_weight_sums(handle, *this); + } else { + return compute_weight_sums(handle, *this); + } +} + +template +rmm::device_uvector graph_view_t< + vertex_t, + edge_t, + weight_t, + store_transposed, + multi_gpu, + std::enable_if_t>::compute_out_weight_sums(raft::handle_t const& handle) const +{ + if (store_transposed) { + return compute_weight_sums(handle, *this); + } else { + return compute_weight_sums(handle, *this); + } +} + +template +edge_t +graph_view_t>:: + compute_max_in_degree(raft::handle_t const& handle) const +{ + auto in_degrees = 
compute_in_degrees(handle); + auto it = thrust::max_element( + rmm::exec_policy(handle.get_stream_view()), in_degrees.begin(), in_degrees.end()); + rmm::device_scalar ret(edge_t{0}, handle.get_stream()); + device_allreduce(handle.get_comms(), + it != in_degrees.end() ? it : ret.data(), + ret.data(), + 1, + raft::comms::op_t::MAX, + handle.get_stream()); + return ret.value(handle.get_stream()); +} + +template +edge_t graph_view_t>::compute_max_in_degree(raft::handle_t const& + handle) const +{ + auto in_degrees = compute_in_degrees(handle); + auto it = thrust::max_element( + rmm::exec_policy(handle.get_stream_view()), in_degrees.begin(), in_degrees.end()); + edge_t ret{0}; + if (it != in_degrees.end()) { raft::update_host(&ret, it, 1, handle.get_stream()); } + handle.get_stream_view().synchronize(); + return ret; +} + +template +edge_t +graph_view_t>:: + compute_max_out_degree(raft::handle_t const& handle) const +{ + auto out_degrees = compute_out_degrees(handle); + auto it = thrust::max_element( + rmm::exec_policy(handle.get_stream_view()), out_degrees.begin(), out_degrees.end()); + rmm::device_scalar ret(edge_t{0}, handle.get_stream()); + device_allreduce(handle.get_comms(), + it != out_degrees.end() ? it : ret.data(), + ret.data(), + 1, + raft::comms::op_t::MAX, + handle.get_stream()); + return ret.value(handle.get_stream()); +} + +template +edge_t graph_view_t>::compute_max_out_degree(raft::handle_t const& + handle) const +{ + auto out_degrees = compute_out_degrees(handle); + auto it = thrust::max_element( + rmm::exec_policy(handle.get_stream_view()), out_degrees.begin(), out_degrees.end()); + edge_t ret{0}; + if (it != out_degrees.end()) { raft::update_host(&ret, it, 1, handle.get_stream()); } + handle.get_stream_view().synchronize(); + return ret; +} + +template +weight_t +graph_view_t>:: + compute_max_in_weight_sum(raft::handle_t const& handle) const +{ + auto in_weight_sums = compute_in_weight_sums(handle); + auto it = thrust::max_element( + rmm::exec_policy(handle.get_stream_view()), in_weight_sums.begin(), in_weight_sums.end()); + rmm::device_scalar ret(weight_t{0.0}, handle.get_stream()); + device_allreduce(handle.get_comms(), + it != in_weight_sums.end() ? it : ret.data(), + ret.data(), + 1, + raft::comms::op_t::MAX, + handle.get_stream()); + return ret.value(handle.get_stream()); +} + +template +weight_t graph_view_t>::compute_max_in_weight_sum(raft::handle_t const& + handle) const +{ + auto in_weight_sums = compute_in_weight_sums(handle); + auto it = thrust::max_element( + rmm::exec_policy(handle.get_stream_view()), in_weight_sums.begin(), in_weight_sums.end()); + weight_t ret{0.0}; + if (it != in_weight_sums.end()) { raft::update_host(&ret, it, 1, handle.get_stream()); } + handle.get_stream_view().synchronize(); + return ret; +} + +template +weight_t +graph_view_t>:: + compute_max_out_weight_sum(raft::handle_t const& handle) const +{ + auto out_weight_sums = compute_out_weight_sums(handle); + auto it = thrust::max_element( + rmm::exec_policy(handle.get_stream_view()), out_weight_sums.begin(), out_weight_sums.end()); + rmm::device_scalar ret(weight_t{0.0}, handle.get_stream()); + device_allreduce(handle.get_comms(), + it != out_weight_sums.end() ? 
it : ret.data(), + ret.data(), + 1, + raft::comms::op_t::MAX, + handle.get_stream()); + return ret.value(handle.get_stream()); +} + +template +weight_t graph_view_t< + vertex_t, + edge_t, + weight_t, + store_transposed, + multi_gpu, + std::enable_if_t>::compute_max_out_weight_sum(raft::handle_t const& handle) const +{ + auto out_weight_sums = compute_out_weight_sums(handle); + auto it = thrust::max_element( + rmm::exec_policy(handle.get_stream_view()), out_weight_sums.begin(), out_weight_sums.end()); + weight_t ret{0.0}; + if (it != out_weight_sums.end()) { raft::update_host(&ret, it, 1, handle.get_stream()); } + handle.get_stream_view().synchronize(); + return ret; +} + // explicit instantiation template class graph_view_t; diff --git a/cpp/src/experimental/induced_subgraph.cu b/cpp/src/experimental/induced_subgraph.cu new file mode 100644 index 00000000000..c43f81bbf39 --- /dev/null +++ b/cpp/src/experimental/induced_subgraph.cu @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +namespace cugraph { +namespace experimental { + +template +std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_induced_subgraphs( + raft::handle_t const& handle, + graph_view_t const& graph_view, + size_t const* subgraph_offsets /* size == num_subgraphs + 1 */, + vertex_t const* subgraph_vertices /* size == subgraph_offsets[num_subgraphs] */, + size_t num_subgraphs, + bool do_expensive_check) +{ +#ifdef TIMING + HighResTimer hr_timer; + hr_timer.start("extract_induced_subgraphs"); +#endif + // FIXME: this code is inefficient for the vertices with their local degrees much larger than the + // number of vertices in the subgraphs (in this case, searching that the subgraph vertices are + // included in the local neighbors is more efficient than searching the local neighbors are + // included in the subgraph vertices). We may later add additional code to handle such cases. + // FIXME: we may consider the performance (speed & memory footprint, hash based approach uses + // extra-memory) of hash table based and binary search based approaches + + // 1. 
check input arguments + + if (do_expensive_check) { + size_t should_be_zero{std::numeric_limits::max()}; + size_t num_aggregate_subgraph_vertices{}; + raft::update_host(&should_be_zero, subgraph_offsets, 1, handle.get_stream()); + raft::update_host( + &num_aggregate_subgraph_vertices, subgraph_offsets + num_subgraphs, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + CUGRAPH_EXPECTS(should_be_zero == 0, + "Invalid input argument: subgraph_offsets[0] should be 0."); + + CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(handle.get_stream_view()), + subgraph_offsets, + subgraph_offsets + (num_subgraphs + 1)), + "Invalid input argument: subgraph_offsets is not sorted."); + auto vertex_partition = + vertex_partition_device_view_t(graph_view.get_vertex_partition_view()); + CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(handle.get_stream_view()), + subgraph_vertices, + subgraph_vertices + num_aggregate_subgraph_vertices, + [vertex_partition] __device__(auto v) { + return !vertex_partition.is_valid_vertex(v) || + !vertex_partition.is_local_vertex_nocheck(v); + }) == 0, + "Invalid input argument: subgraph_vertices has invalid vertex IDs."); + + CUGRAPH_EXPECTS( + thrust::count_if( + rmm::exec_policy(handle.get_stream_view()), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_subgraphs), + [subgraph_offsets, subgraph_vertices] __device__(auto i) { + // vertices are sorted and unique + return !thrust::is_sorted(thrust::seq, + subgraph_vertices + subgraph_offsets[i], + subgraph_vertices + subgraph_offsets[i + 1]) || + (thrust::count_if( + thrust::seq, + thrust::make_counting_iterator(subgraph_offsets[i]), + thrust::make_counting_iterator(subgraph_offsets[i + 1]), + [subgraph_vertices, last = subgraph_offsets[i + 1] - 1] __device__(auto i) { + return (i != last) && (subgraph_vertices[i] == subgraph_vertices[i + 1]); + }) != 0); + }) == 0, + "Invalid input argument: subgraph_vertices for each subgraph idx should be sorted in " + "ascending order and unique."); + } + + // 2. extract induced subgraphs + + if (multi_gpu) { + CUGRAPH_FAIL("Unimplemented."); + return std::make_tuple(rmm::device_uvector(0, handle.get_stream_view()), + rmm::device_uvector(0, handle.get_stream_view()), + rmm::device_uvector(0, handle.get_stream_view()), + rmm::device_uvector(0, handle.get_stream_view())); + } else { + // 2-1. Phase 1: calculate memory requirements + + size_t num_aggregate_subgraph_vertices{}; + raft::update_host( + &num_aggregate_subgraph_vertices, subgraph_offsets + num_subgraphs, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + + rmm::device_uvector subgraph_vertex_output_offsets( + num_aggregate_subgraph_vertices + 1, + handle.get_stream_view()); // for each element of subgraph_vertices + + auto matrix_partition = matrix_partition_device_view_t( + graph_view.get_matrix_partition_view()); + // count the numbers of the induced subgraph edges for each vertex in the aggregate subgraph + // vertex list. 
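// Illustrative aside: Phase 1 below (a per-vertex count via thrust::transform,
// then thrust::exclusive_scan) and Phase 2 (a fill at the scanned offsets) are
// the classic count / scan / fill pattern for producing variable-sized output
// on the GPU without dynamic allocation. In hypothetical standalone form:
//   thrust::transform(policy, item_first, item_last, sizes.begin(), count_outputs_of_item);
//   thrust::exclusive_scan(policy, sizes.begin(), sizes.end(), sizes.begin());
//   // now sizes[i] is where item i starts writing; sizes.back() is the total output size
//   thrust::for_each(policy, item_first, item_last, write_outputs_at_own_offset);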
+ thrust::transform( + rmm::exec_policy(handle.get_stream_view()), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_aggregate_subgraph_vertices), + subgraph_vertex_output_offsets.begin(), + [subgraph_offsets, subgraph_vertices, num_subgraphs, matrix_partition] __device__(auto i) { + auto subgraph_idx = thrust::distance( + subgraph_offsets + 1, + thrust::upper_bound(thrust::seq, subgraph_offsets, subgraph_offsets + num_subgraphs, i)); + vertex_t const* indices{nullptr}; + thrust::optional weights{thrust::nullopt}; + edge_t local_degree{}; + auto major_offset = + matrix_partition.get_major_offset_from_major_nocheck(subgraph_vertices[i]); + thrust::tie(indices, weights, local_degree) = + matrix_partition.get_local_edges(major_offset); + // FIXME: this is inefficient for high local degree vertices + return thrust::count_if( + thrust::seq, + indices, + indices + local_degree, + [vertex_first = subgraph_vertices + subgraph_offsets[subgraph_idx], + vertex_last = + subgraph_vertices + subgraph_offsets[subgraph_idx + 1]] __device__(auto nbr) { + return thrust::binary_search(thrust::seq, vertex_first, vertex_last, nbr); + }); + }); + thrust::exclusive_scan(rmm::exec_policy(handle.get_stream_view()), + subgraph_vertex_output_offsets.begin(), + subgraph_vertex_output_offsets.end(), + subgraph_vertex_output_offsets.begin()); + + size_t num_aggregate_edges{}; + raft::update_host(&num_aggregate_edges, + subgraph_vertex_output_offsets.data() + num_aggregate_subgraph_vertices, + 1, + handle.get_stream()); + handle.get_stream_view().synchronize(); + + // 2-2. Phase 2: find the edges in the induced subgraphs + + rmm::device_uvector edge_majors(num_aggregate_edges, handle.get_stream_view()); + rmm::device_uvector edge_minors(num_aggregate_edges, handle.get_stream_view()); + auto edge_weights = graph_view.is_weighted() + ? std::make_optional>( + num_aggregate_edges, handle.get_stream_view()) + : std::nullopt; + + // fill the edge list buffer (to be returned) for each vertex in the aggregate subgraph vertex + // list (use the offsets computed in Phase 1) + thrust::for_each( + rmm::exec_policy(handle.get_stream_view()), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_aggregate_subgraph_vertices), + [subgraph_offsets, + subgraph_vertices, + num_subgraphs, + matrix_partition, + subgraph_vertex_output_offsets = subgraph_vertex_output_offsets.data(), + edge_majors = edge_majors.data(), + edge_minors = edge_minors.data(), + edge_weights = edge_weights ? 
thrust::optional{(*edge_weights).data()} + : thrust::nullopt] __device__(auto i) { + auto subgraph_idx = thrust::distance( + subgraph_offsets + 1, + thrust::upper_bound( + thrust::seq, subgraph_offsets, subgraph_offsets + num_subgraphs, size_t{i})); + vertex_t const* indices{nullptr}; + thrust::optional weights{thrust::nullopt}; + edge_t local_degree{}; + auto major_offset = + matrix_partition.get_major_offset_from_major_nocheck(subgraph_vertices[i]); + thrust::tie(indices, weights, local_degree) = + matrix_partition.get_local_edges(major_offset); + if (weights) { + auto triplet_first = thrust::make_zip_iterator(thrust::make_tuple( + thrust::make_constant_iterator(subgraph_vertices[i]), indices, *weights)); + // FIXME: this is inefficient for high local degree vertices + thrust::copy_if( + thrust::seq, + triplet_first, + triplet_first + local_degree, + thrust::make_zip_iterator(thrust::make_tuple(edge_majors, edge_minors, *edge_weights)) + + subgraph_vertex_output_offsets[i], + [vertex_first = subgraph_vertices + subgraph_offsets[subgraph_idx], + vertex_last = + subgraph_vertices + subgraph_offsets[subgraph_idx + 1]] __device__(auto t) { + return thrust::binary_search( + thrust::seq, vertex_first, vertex_last, thrust::get<1>(t)); + }); + } else { + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(thrust::make_constant_iterator(subgraph_vertices[i]), indices)); + // FIXME: this is inefficient for high local degree vertices + thrust::copy_if(thrust::seq, + pair_first, + pair_first + local_degree, + thrust::make_zip_iterator(thrust::make_tuple(edge_majors, edge_minors)) + + subgraph_vertex_output_offsets[i], + [vertex_first = subgraph_vertices + subgraph_offsets[subgraph_idx], + vertex_last = subgraph_vertices + + subgraph_offsets[subgraph_idx + 1]] __device__(auto t) { + return thrust::binary_search( + thrust::seq, vertex_first, vertex_last, thrust::get<1>(t)); + }); + } + }); + + rmm::device_uvector subgraph_edge_offsets(num_subgraphs + 1, handle.get_stream_view()); + thrust::gather(rmm::exec_policy(handle.get_stream_view()), + subgraph_offsets, + subgraph_offsets + (num_subgraphs + 1), + subgraph_vertex_output_offsets.begin(), + subgraph_edge_offsets.begin()); +#ifdef TIMING + hr_timer.stop(); + hr_timer.display(std::cout); +#endif + return std::make_tuple(std::move(edge_majors), + std::move(edge_minors), + std::move(edge_weights), + std::move(subgraph_edge_offsets)); + } +} + +// explicit instantiation + +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_induced_subgraphs(raft::handle_t const& handle, + graph_view_t const& graph_view, + size_t const* subgraph_offsets, + int32_t const* subgraph_vertices, + size_t num_subgraphs, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_induced_subgraphs(raft::handle_t const& handle, + graph_view_t const& graph_view, + size_t const* subgraph_offsets, + int32_t const* subgraph_vertices, + size_t num_subgraphs, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_induced_subgraphs(raft::handle_t const& handle, + graph_view_t const& graph_view, + size_t const* subgraph_offsets, + int32_t const* subgraph_vertices, + size_t num_subgraphs, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_induced_subgraphs(raft::handle_t const& handle, + graph_view_t const& graph_view, + size_t 
const* subgraph_offsets, + int32_t const* subgraph_vertices, + size_t num_subgraphs, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_induced_subgraphs(raft::handle_t const& handle, + graph_view_t const& graph_view, + size_t const* subgraph_offsets, + int32_t const* subgraph_vertices, + size_t num_subgraphs, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_induced_subgraphs(raft::handle_t const& handle, + graph_view_t const& graph_view, + size_t const* subgraph_offsets, + int32_t const* subgraph_vertices, + size_t num_subgraphs, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_induced_subgraphs(raft::handle_t const& handle, + graph_view_t const& graph_view, + size_t const* subgraph_offsets, + int32_t const* subgraph_vertices, + size_t num_subgraphs, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_induced_subgraphs(raft::handle_t const& handle, + graph_view_t const& graph_view, + size_t const* subgraph_offsets, + int32_t const* subgraph_vertices, + size_t num_subgraphs, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_induced_subgraphs(raft::handle_t const& handle, + graph_view_t const& graph_view, + size_t const* subgraph_offsets, + int64_t const* subgraph_vertices, + size_t num_subgraphs, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_induced_subgraphs(raft::handle_t const& handle, + graph_view_t const& graph_view, + size_t const* subgraph_offsets, + int64_t const* subgraph_vertices, + size_t num_subgraphs, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_induced_subgraphs(raft::handle_t const& handle, + graph_view_t const& graph_view, + size_t const* subgraph_offsets, + int64_t const* subgraph_vertices, + size_t num_subgraphs, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + rmm::device_uvector> +extract_induced_subgraphs(raft::handle_t const& handle, + graph_view_t const& graph_view, + size_t const* subgraph_offsets, + int64_t const* subgraph_vertices, + size_t num_subgraphs, + bool do_expensive_check); + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/experimental/katz_centrality.cu b/cpp/src/experimental/katz_centrality.cu index 587011da817..b8ab45277fc 100644 --- a/cpp/src/experimental/katz_centrality.cu +++ b/cpp/src/experimental/katz_centrality.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,13 @@ * limitations under the License. 
*/ -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -36,10 +36,10 @@ namespace experimental { namespace detail { template -void katz_centrality(raft::handle_t const &handle, - GraphViewType const &pull_graph_view, - result_t *betas, - result_t *katz_centralities, +void katz_centrality(raft::handle_t const& handle, + GraphViewType const& pull_graph_view, + result_t const* betas, + result_t* katz_centralities, result_t alpha, result_t beta, // relevant only if betas == nullptr result_t epsilon, @@ -166,17 +166,15 @@ void katz_centrality(raft::handle_t const &handle, katz_centralities, [l2_norm] __device__(auto val) { return val / l2_norm; }); } - - return; } } // namespace detail template -void katz_centrality(raft::handle_t const &handle, - graph_view_t const &graph_view, - result_t *betas, - result_t *katz_centralities, +void katz_centrality(raft::handle_t const& handle, + graph_view_t const& graph_view, + result_t const* betas, + result_t* katz_centralities, result_t alpha, result_t beta, // relevant only if beta == nullptr result_t epsilon, @@ -200,10 +198,10 @@ void katz_centrality(raft::handle_t const &handle, // explicit instantiation -template void katz_centrality(raft::handle_t const &handle, - graph_view_t const &graph_view, - float *betas, - float *katz_centralities, +template void katz_centrality(raft::handle_t const& handle, + graph_view_t const& graph_view, + float const* betas, + float* katz_centralities, float alpha, float beta, float epsilon, @@ -212,10 +210,10 @@ template void katz_centrality(raft::handle_t const &handle, bool normalize, bool do_expensive_check); -template void katz_centrality(raft::handle_t const &handle, - graph_view_t const &graph_view, - double *betas, - double *katz_centralities, +template void katz_centrality(raft::handle_t const& handle, + graph_view_t const& graph_view, + double const* betas, + double* katz_centralities, double alpha, double beta, double epsilon, @@ -224,10 +222,10 @@ template void katz_centrality(raft::handle_t const &handle, bool normalize, bool do_expensive_check); -template void katz_centrality(raft::handle_t const &handle, - graph_view_t const &graph_view, - float *betas, - float *katz_centralities, +template void katz_centrality(raft::handle_t const& handle, + graph_view_t const& graph_view, + float const* betas, + float* katz_centralities, float alpha, float beta, float epsilon, @@ -236,10 +234,10 @@ template void katz_centrality(raft::handle_t const &handle, bool normalize, bool do_expensive_check); -template void katz_centrality(raft::handle_t const &handle, - graph_view_t const &graph_view, - double *betas, - double *katz_centralities, +template void katz_centrality(raft::handle_t const& handle, + graph_view_t const& graph_view, + double const* betas, + double* katz_centralities, double alpha, double beta, double epsilon, @@ -248,10 +246,10 @@ template void katz_centrality(raft::handle_t const &handle, bool normalize, bool do_expensive_check); -template void katz_centrality(raft::handle_t const &handle, - graph_view_t const &graph_view, - float *betas, - float *katz_centralities, +template void katz_centrality(raft::handle_t const& handle, + graph_view_t const& graph_view, + float const* betas, + float* katz_centralities, float alpha, float beta, float epsilon, @@ -260,10 +258,10 @@ template void katz_centrality(raft::handle_t const &handle, bool normalize, bool do_expensive_check); -template void 
katz_centrality(raft::handle_t const &handle, - graph_view_t const &graph_view, - double *betas, - double *katz_centralities, +template void katz_centrality(raft::handle_t const& handle, + graph_view_t const& graph_view, + double const* betas, + double* katz_centralities, double alpha, double beta, double epsilon, @@ -272,10 +270,10 @@ template void katz_centrality(raft::handle_t const &handle, bool normalize, bool do_expensive_check); -template void katz_centrality(raft::handle_t const &handle, - graph_view_t const &graph_view, - float *betas, - float *katz_centralities, +template void katz_centrality(raft::handle_t const& handle, + graph_view_t const& graph_view, + float const* betas, + float* katz_centralities, float alpha, float beta, float epsilon, @@ -284,10 +282,10 @@ template void katz_centrality(raft::handle_t const &handle, bool normalize, bool do_expensive_check); -template void katz_centrality(raft::handle_t const &handle, - graph_view_t const &graph_view, - double *betas, - double *katz_centralities, +template void katz_centrality(raft::handle_t const& handle, + graph_view_t const& graph_view, + double const* betas, + double* katz_centralities, double alpha, double beta, double epsilon, @@ -296,10 +294,10 @@ template void katz_centrality(raft::handle_t const &handle, bool normalize, bool do_expensive_check); -template void katz_centrality(raft::handle_t const &handle, - graph_view_t const &graph_view, - float *betas, - float *katz_centralities, +template void katz_centrality(raft::handle_t const& handle, + graph_view_t const& graph_view, + float const* betas, + float* katz_centralities, float alpha, float beta, float epsilon, @@ -308,10 +306,10 @@ template void katz_centrality(raft::handle_t const &handle, bool normalize, bool do_expensive_check); -template void katz_centrality(raft::handle_t const &handle, - graph_view_t const &graph_view, - double *betas, - double *katz_centralities, +template void katz_centrality(raft::handle_t const& handle, + graph_view_t const& graph_view, + double const* betas, + double* katz_centralities, double alpha, double beta, double epsilon, @@ -320,10 +318,10 @@ template void katz_centrality(raft::handle_t const &handle, bool normalize, bool do_expensive_check); -template void katz_centrality(raft::handle_t const &handle, - graph_view_t const &graph_view, - float *betas, - float *katz_centralities, +template void katz_centrality(raft::handle_t const& handle, + graph_view_t const& graph_view, + float const* betas, + float* katz_centralities, float alpha, float beta, float epsilon, @@ -332,10 +330,10 @@ template void katz_centrality(raft::handle_t const &handle, bool normalize, bool do_expensive_check); -template void katz_centrality(raft::handle_t const &handle, - graph_view_t const &graph_view, - double *betas, - double *katz_centralities, +template void katz_centrality(raft::handle_t const& handle, + graph_view_t const& graph_view, + double const* betas, + double* katz_centralities, double alpha, double beta, double epsilon, diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 1f6f8633bcd..e3b5bf91ccc 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,23 +15,21 @@ */ #pragma once -#include - -#include +#include -#include -#include -#include -#include -#include -#include +#include +#include -#include +#include +#include +#include +#include +#include +#include +#include -#include -#include -#include -#include +#include +#include //#define TIMING @@ -42,341 +40,6 @@ namespace cugraph { namespace experimental { -namespace detail { - -template -struct create_cuco_pair_t { - cuco::pair_type __device__ operator()(data_t data) - { - cuco::pair_type tmp; - tmp.first = data; - tmp.second = data_t{0}; - return tmp; - } -}; - -// -// These classes should allow cuco::static_map to generate hash tables of -// different configurations. -// - -// -// Compare edges based on src[e] and dst[e] matching -// -template -class src_dst_equality_comparator_t { - public: - src_dst_equality_comparator_t(rmm::device_vector const &src, - rmm::device_vector const &dst, - sentinel_t sentinel_value) - : d_src_{src.data().get()}, d_dst_{dst.data().get()}, sentinel_value_(sentinel_value) - { - } - - src_dst_equality_comparator_t(data_t const *d_src, data_t const *d_dst, sentinel_t sentinel_value) - : d_src_{d_src}, d_dst_{d_dst}, sentinel_value_(sentinel_value) - { - } - - template - __device__ bool operator()(idx_type lhs_index, idx_type rhs_index) const noexcept - { - return (lhs_index != sentinel_value_) && (rhs_index != sentinel_value_) && - (d_src_[lhs_index] == d_src_[rhs_index]) && (d_dst_[lhs_index] == d_dst_[rhs_index]); - } - - private: - data_t const *d_src_; - data_t const *d_dst_; - sentinel_t sentinel_value_; -}; - -// -// Hash edges based src[e] and dst[e] -// -template -class src_dst_hasher_t { - public: - src_dst_hasher_t(rmm::device_vector const &src, rmm::device_vector const &dst) - : d_src_{src.data().get()}, d_dst_{dst.data().get()} - { - } - - src_dst_hasher_t(data_t const *d_src, data_t const *d_dst) : d_src_{d_src}, d_dst_{d_dst} {} - - template - __device__ auto operator()(idx_type index) const - { - cuco::detail::MurmurHash3_32 hasher; - - auto h_src = hasher(d_src_[index]); - auto h_dst = hasher(d_dst_[index]); - - /* - * Combine the source hash and the dest hash into a single hash value - * - * Taken from the Boost hash_combine function - * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html - */ - h_src ^= h_dst + 0x9e3779b9 + (h_src << 6) + (h_src >> 2); - - return h_src; - } - - private: - data_t const *d_src_; - data_t const *d_dst_; -}; - -// -// Compare edges based on src[e] and cluster[dst[e]] matching -// -template -class src_cluster_equality_comparator_t { - public: - src_cluster_equality_comparator_t(rmm::device_vector const &src, - rmm::device_vector const &dst, - rmm::device_vector const &dst_cluster_cache, - data_t base_dst_id, - sentinel_t sentinel_value) - : d_src_{src.data().get()}, - d_dst_{dst.data().get()}, - d_dst_cluster_{dst_cluster_cache.data().get()}, - base_dst_id_(base_dst_id), - sentinel_value_(sentinel_value) - { - } - - src_cluster_equality_comparator_t(data_t const *d_src, - data_t const *d_dst, - data_t const *d_dst_cluster_cache, - data_t base_dst_id, - sentinel_t sentinel_value) - : d_src_{d_src}, - d_dst_{d_dst}, - d_dst_cluster_{d_dst_cluster_cache}, - base_dst_id_(base_dst_id), - sentinel_value_(sentinel_value) - { - } - - __device__ bool operator()(sentinel_t lhs_index, sentinel_t rhs_index) const noexcept - { - return (lhs_index != sentinel_value_) && (rhs_index != sentinel_value_) && - (d_src_[lhs_index] == d_src_[rhs_index]) && - (d_dst_cluster_[d_dst_[lhs_index] - 
base_dst_id_] == - d_dst_cluster_[d_dst_[rhs_index] - base_dst_id_]); - } - - private: - data_t const *d_src_; - data_t const *d_dst_; - data_t const *d_dst_cluster_; - data_t base_dst_id_; - sentinel_t sentinel_value_; -}; - -// -// Hash edges based src[e] and cluster[dst[e]] -// -template -class src_cluster_hasher_t { - public: - src_cluster_hasher_t(rmm::device_vector const &src, - rmm::device_vector const &dst, - rmm::device_vector const &dst_cluster_cache, - data_t base_dst_id) - : d_src_{src.data().get()}, - d_dst_{dst.data().get()}, - d_dst_cluster_{dst_cluster_cache.data().get()}, - base_dst_id_(base_dst_id) - { - } - - src_cluster_hasher_t(data_t const *d_src, - data_t const *d_dst, - data_t const *d_dst_cluster_cache, - data_t base_dst_id) - : d_src_{d_src}, d_dst_{d_dst}, d_dst_cluster_{d_dst_cluster_cache}, base_dst_id_(base_dst_id) - { - } - - template - __device__ auto operator()(idx_type index) const - { - cuco::detail::MurmurHash3_32 hasher; - - auto h_src = hasher(d_src_[index]); - auto h_cluster = hasher(d_dst_cluster_[d_dst_[index] - base_dst_id_]); - - /* - * Combine the source hash and the cluster hash into a single hash value - * - * Taken from the Boost hash_combine function - * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html - */ - h_src ^= h_cluster + 0x9e3779b9 + (h_src << 6) + (h_src >> 2); - - return h_src; - } - - private: - data_t const *d_src_; - data_t const *d_dst_; - data_t const *d_dst_cluster_; - data_t base_dst_id_; -}; - -// -// Skip edges where src[e] == dst[e] -// -template -class skip_edge_t { - public: - skip_edge_t(rmm::device_vector const &src, rmm::device_vector const &dst) - : d_src_{src.data().get()}, d_dst_{dst.data().get()} - { - } - - skip_edge_t(data_t const *src, data_t const *dst) : d_src_{src}, d_dst_{dst} {} - - template - __device__ auto operator()(idx_type index) const - { - return d_src_[index] == d_dst_[index]; - } - - private: - data_t const *d_src_; - data_t const *d_dst_; -}; - -template -struct lookup_by_vertex_id { - public: - lookup_by_vertex_id(data_t const *d_array, vertex_t const *d_vertices, vertex_t base_vertex_id) - : d_array_(d_array), d_vertices_(d_vertices), base_vertex_id_(base_vertex_id) - { - } - - template - data_t operator() __device__(edge_t edge_id) const - { - return d_array_[d_vertices_[edge_id] - base_vertex_id_]; - } - - private: - data_t const *d_array_; - vertex_t const *d_vertices_; - vertex_t base_vertex_id_; -}; - -template -vector_t remove_elements_from_vector(vector_t const &input_v, - iterator_t iterator_begin, - iterator_t iterator_end, - function_t function, - cudaStream_t stream) -{ - vector_t temp_v(input_v.size()); - - auto last = thrust::copy_if( - rmm::exec_policy(stream)->on(stream), iterator_begin, iterator_end, temp_v.begin(), function); - - temp_v.resize(thrust::distance(temp_v.begin(), last)); - - return temp_v; -} - -template -vector_t remove_elements_from_vector(vector_t const &input_v, - function_t function, - cudaStream_t stream) -{ - return remove_elements_from_vector(input_v, input_v.begin(), input_v.end(), function, stream); -} - -// FIXME: This should be a generic utility. 
The one in cython.cu -// is very close to this -template * = nullptr> -std::unique_ptr> -create_graph(raft::handle_t const &handle, - rmm::device_vector const &src_v, - rmm::device_vector const &dst_v, - rmm::device_vector const &weight_v, - std::size_t num_local_verts, - experimental::graph_properties_t graph_props, - view_t const &view) -{ - std::vector> edgelist( - {{src_v.data().get(), - dst_v.data().get(), - weight_v.data().get(), - static_cast(src_v.size())}}); - - return std::make_unique>( - handle, - edgelist, - view.get_partition(), - num_local_verts, - src_v.size(), - graph_props, - false, - false); -} - -template * = nullptr> -std::unique_ptr> -create_graph(raft::handle_t const &handle, - rmm::device_vector const &src_v, - rmm::device_vector const &dst_v, - rmm::device_vector const &weight_v, - std::size_t num_local_verts, - experimental::graph_properties_t graph_props, - view_t const &view) -{ - experimental::edgelist_t edgelist{ - src_v.data().get(), - dst_v.data().get(), - weight_v.data().get(), - static_cast(src_v.size())}; - - return std::make_unique>( - handle, edgelist, num_local_verts, graph_props, false, false); -} - -} // namespace detail - -// -// FIXME: Ultimately, this would be cleaner and more efficient if we did the following: -// -// 1) Create an object that does a single level Louvain computation on an input graph -// (no graph contraction) -// 2) Create an object that does graph contraction -// 3) Create Louvain to use these objects in sequence to compute the aggregate result. -// -// In MNMG-world, the graph contraction step is going to create another graph that likely -// fits efficiently in a smaller number of GPUs (eventually one). Decomposing the algorithm -// as above would allow us to eventually run the single GPU version of single level Louvain -// on the contracted graphs - which should be more efficient. -// -// FIXME: We should return the dendogram and let the python layer clean it up (or have a -// separate C++ function to flatten the dendogram). 
There are customers that might -// like the dendogram and the implementation would be a bit cleaner if we did the -// collapsing as a separate step -// template class Louvain { public: @@ -390,82 +53,47 @@ class Louvain { graph_view_t::is_adj_matrix_transposed, graph_view_t::is_multi_gpu>; - Louvain(raft::handle_t const &handle, graph_view_t const &graph_view) + Louvain(raft::handle_t const& handle, graph_view_t const& graph_view) : #ifdef TIMING hr_timer_(), #endif handle_(handle), + dendrogram_(std::make_unique>()), current_graph_view_(graph_view), - compute_partition_(graph_view), - local_num_vertices_(graph_view.get_number_of_local_vertices()), - local_num_rows_(graph_view.get_number_of_local_adj_matrix_partition_rows()), - local_num_cols_(graph_view.get_number_of_local_adj_matrix_partition_cols()), - local_num_edges_(graph_view.get_number_of_edges()), - vertex_weights_v_(graph_view.get_number_of_local_vertices()), - cluster_weights_v_(graph_view.get_number_of_local_vertices()), - cluster_v_(graph_view.get_number_of_local_vertices()), - number_of_vertices_(graph_view.get_number_of_local_vertices()), - stream_(handle.get_stream()) + cluster_keys_v_(graph_view.get_number_of_local_vertices(), handle.get_stream_view()), + cluster_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream_view()), + vertex_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream_view()), + src_vertex_weights_cache_v_(0, handle.get_stream_view()), + src_cluster_cache_v_(0, handle.get_stream_view()), + dst_cluster_cache_v_(0, handle.get_stream_view()) { - if (graph_view_t::is_multi_gpu) { - rank_ = handle.get_comms().get_rank(); - base_vertex_id_ = graph_view.get_local_vertex_first(); - base_src_vertex_id_ = graph_view.get_local_adj_matrix_partition_row_first(0); - base_dst_vertex_id_ = graph_view.get_local_adj_matrix_partition_col_first(0); - - raft::copy(&local_num_edges_, - graph_view.offsets() + graph_view.get_local_adj_matrix_partition_row_last(0) - - graph_view.get_local_adj_matrix_partition_row_first(0), - 1, - stream_); - - CUDA_TRY(cudaStreamSynchronize(stream_)); - } + } - src_indices_v_.resize(local_num_edges_); + Dendrogram const& get_dendrogram() const { return *dendrogram_; } - cugraph::detail::offsets_to_indices( - current_graph_view_.offsets(), local_num_rows_, src_indices_v_.data().get()); + Dendrogram& get_dendrogram() { return *dendrogram_; } - if (base_src_vertex_id_ > 0) { - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - src_indices_v_.begin(), - src_indices_v_.end(), - thrust::make_constant_iterator(base_src_vertex_id_), - src_indices_v_.begin(), - thrust::plus()); - } - } + std::unique_ptr> move_dendrogram() { return std::move(dendrogram_); } - virtual std::pair operator()(vertex_t *d_cluster_vec, - size_t max_level, - weight_t resolution) + virtual weight_t operator()(size_t max_level, weight_t resolution) { - size_t num_level{0}; + weight_t best_modularity = weight_t{-1}; - weight_t total_edge_weight; - total_edge_weight = experimental::transform_reduce_e( + weight_t total_edge_weight = experimental::transform_reduce_e( handle_, current_graph_view_, thrust::make_constant_iterator(0), thrust::make_constant_iterator(0), - [] __device__(auto, auto, weight_t wt, auto, auto) { return wt; }, + [] __device__(auto src, auto dst, weight_t wt, auto, auto) { return wt; }, weight_t{0}); - weight_t best_modularity = weight_t{-1}; + while (dendrogram_->num_levels() < max_level) { + // + // Initialize every cluster to reference each vertex to itself + // + 
initialize_dendrogram_level(current_graph_view_.get_number_of_local_vertices()); - // - // Initialize every cluster to reference each vertex to itself - // - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), - cluster_v_.begin(), - cluster_v_.end(), - base_vertex_id_); - thrust::copy( - rmm::exec_policy(stream_)->on(stream_), cluster_v_.begin(), cluster_v_.end(), d_cluster_vec); - - while (num_level < max_level) { compute_vertex_and_cluster_weights(); weight_t new_Q = update_clustering(total_edge_weight, resolution); @@ -474,56 +102,85 @@ class Louvain { best_modularity = new_Q; - shrink_graph(d_cluster_vec); - - num_level++; + shrink_graph(); } timer_display(std::cout); - return std::make_pair(num_level, best_modularity); + return best_modularity; } protected: - void timer_start(std::string const ®ion) + void timer_start(std::string const& region) { #ifdef TIMING - if (rank_ == 0) hr_timer_.start(region); + if (graph_view_t::is_multi_gpu) { + if (handle.get_comms().get_rank() == 0) hr_timer_.start(region); + } else { + hr_timer_.start(region); + } #endif } - void timer_stop(cudaStream_t stream) + void timer_stop(rmm::cuda_stream_view stream_view) { #ifdef TIMING - if (rank_ == 0) { - CUDA_TRY(cudaStreamSynchronize(stream)); + if (graph_view_t::is_multi_gpu) { + if (handle.get_comms().get_rank() == 0) { + stream_view.synchronize(); + hr_timer_.stop(); + } + } else { + stream_view.synchronize(); hr_timer_.stop(); } #endif } - void timer_display(std::ostream &os) + void timer_display(std::ostream& os) { #ifdef TIMING - if (rank_ == 0) hr_timer_.display(os); + if (graph_view_t::is_multi_gpu) { + if (handle.get_comms().get_rank() == 0) hr_timer_.display(os); + } else { + hr_timer_.display(os); + } #endif } + protected: + void initialize_dendrogram_level(vertex_t num_vertices) + { + dendrogram_->add_level( + current_graph_view_.get_local_vertex_first(), num_vertices, handle_.get_stream_view()); + + thrust::sequence(rmm::exec_policy(handle_.get_stream_view()), + dendrogram_->current_level_begin(), + dendrogram_->current_level_end(), + current_graph_view_.get_local_vertex_first()); + } + public: weight_t modularity(weight_t total_edge_weight, weight_t resolution) { - weight_t sum_degree_squared = experimental::transform_reduce_v( - handle_, - current_graph_view_, + weight_t sum_degree_squared = thrust::transform_reduce( + rmm::exec_policy(handle_.get_stream_view()), cluster_weights_v_.begin(), + cluster_weights_v_.end(), [] __device__(weight_t p) { return p * p; }, - weight_t{0}); + weight_t{0}, + thrust::plus()); + + if (graph_t::is_multi_gpu) { + sum_degree_squared = + host_scalar_allreduce(handle_.get_comms(), sum_degree_squared, handle_.get_stream()); + } weight_t sum_internal = experimental::transform_reduce_e( handle_, current_graph_view_, - src_cluster_cache_v_.begin(), - dst_cluster_cache_v_.begin(), + d_src_cluster_cache_, + d_dst_cluster_cache_, [] __device__(auto src, auto dst, weight_t wt, auto src_cluster, auto nbr_cluster) { if (src_cluster == nbr_cluster) { return wt; @@ -543,53 +200,71 @@ class Louvain { { timer_start("compute_vertex_and_cluster_weights"); - experimental::copy_v_transform_reduce_out_nbr( - handle_, - current_graph_view_, - thrust::make_constant_iterator(0), - thrust::make_constant_iterator(0), - [] __device__(auto src, auto, auto wt, auto, auto) { return wt; }, - weight_t{0}, - vertex_weights_v_.begin()); + vertex_weights_v_ = current_graph_view_.compute_out_weight_sums(handle_); + cluster_keys_v_.resize(vertex_weights_v_.size(), handle_.get_stream_view()); 
+ cluster_weights_v_.resize(vertex_weights_v_.size(), handle_.get_stream_view()); - thrust::copy(rmm::exec_policy(stream_)->on(stream_), - vertex_weights_v_.begin(), - vertex_weights_v_.end(), - cluster_weights_v_.begin()); + thrust::sequence(rmm::exec_policy(handle_.get_stream_view()), + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + current_graph_view_.get_local_vertex_first()); - cache_vertex_properties( - vertex_weights_v_, src_vertex_weights_cache_v_, dst_vertex_weights_cache_v_); + raft::copy(cluster_weights_v_.begin(), + vertex_weights_v_.begin(), + vertex_weights_v_.size(), + handle_.get_stream()); - cache_vertex_properties( - cluster_weights_v_, src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); + d_src_vertex_weights_cache_ = + cache_src_vertex_properties(vertex_weights_v_, src_vertex_weights_cache_v_); - timer_stop(stream_); + if (graph_view_t::is_multi_gpu) { + auto const comm_size = handle_.get_comms().get_size(); + rmm::device_uvector rx_keys_v(0, handle_.get_stream_view()); + rmm::device_uvector rx_weights_v(0, handle_.get_stream_view()); + + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(cluster_keys_v_.begin(), cluster_weights_v_.begin())); + + std::forward_as_tuple(std::tie(rx_keys_v, rx_weights_v), std::ignore) = + groupby_gpuid_and_shuffle_values( + handle_.get_comms(), + pair_first, + pair_first + current_graph_view_.get_number_of_local_vertices(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{ + comm_size}] __device__(auto val) { return key_func(thrust::get<0>(val)); }, + handle_.get_stream_view()); + + cluster_keys_v_ = std::move(rx_keys_v); + cluster_weights_v_ = std::move(rx_weights_v); + } + + timer_stop(handle_.get_stream_view()); } - // - // FIXME: Consider returning d_src_cache and d_dst_cache - // (as a pair). This would be a nice optimization - // for single GPU, as we wouldn't need to make 3 copies - // of the data, could return a pair of device pointers to - // local_input_v. 
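//
// Sketch: the FIXME above is what the replacement cache_src_vertex_properties()/
// cache_dst_vertex_properties() below realize; on a single GPU they return a
// pointer that aliases the caller's buffer instead of materializing another
// copy. A minimal host-side restatement of that pattern, with std::vector
// standing in for rmm::device_uvector (cache_or_alias is an illustrative name,
// not cuGraph API):
//
#include <vector>

template <typename T>
T* cache_or_alias(bool multi_gpu, std::vector<T>& input, std::vector<T>& cache)
{
  if (multi_gpu) {
    cache = input;        // stands in for copy_to_adj_matrix_row()/..._col()
    return cache.data();  // callers read the replicated cache
  }
  return input.data();    // single GPU: no copy, just alias the input buffer
}
//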
- // template - void cache_vertex_properties(rmm::device_vector const &local_input_v, - rmm::device_vector &src_cache_v, - rmm::device_vector &dst_cache_v, - bool src = true, - bool dst = true) + T* cache_src_vertex_properties(rmm::device_uvector& input, rmm::device_uvector& src_cache_v) { - if (src) { - src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows()); - copy_to_adj_matrix_row( - handle_, current_graph_view_, local_input_v.begin(), src_cache_v.begin()); + if (graph_view_t::is_multi_gpu) { + src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), + handle_.get_stream_view()); + copy_to_adj_matrix_row(handle_, current_graph_view_, input.begin(), src_cache_v.begin()); + return src_cache_v.begin(); + } else { + return input.begin(); } + } - if (dst) { - dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols()); - copy_to_adj_matrix_col( - handle_, current_graph_view_, local_input_v.begin(), dst_cache_v.begin()); + template + T* cache_dst_vertex_properties(rmm::device_uvector& input, rmm::device_uvector& dst_cache_v) + { + if (graph_view_t::is_multi_gpu) { + dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols(), + handle_.get_stream_view()); + copy_to_adj_matrix_col(handle_, current_graph_view_, input.begin(), dst_cache_v.begin()); + return dst_cache_v.begin(); + } else { + return input.begin(); } } @@ -597,9 +272,16 @@ class Louvain { { timer_start("update_clustering"); - rmm::device_vector next_cluster_v(cluster_v_); + rmm::device_uvector next_cluster_v(dendrogram_->current_level_size(), + handle_.get_stream_view()); + + raft::copy(next_cluster_v.begin(), + dendrogram_->current_level_begin(), + dendrogram_->current_level_size(), + handle_.get_stream()); - cache_vertex_properties(next_cluster_v, src_cluster_cache_v_, dst_cluster_cache_v_); + d_src_cluster_cache_ = cache_src_vertex_properties(next_cluster_v, src_cluster_cache_v_); + d_dst_cluster_cache_ = cache_dst_vertex_properties(next_cluster_v, dst_cluster_cache_v_); weight_t new_Q = modularity(total_edge_weight, resolution); weight_t cur_Q = new_Q - 1; @@ -616,1102 +298,267 @@ class Louvain { up_down = !up_down; - cache_vertex_properties(next_cluster_v, src_cluster_cache_v_, dst_cluster_cache_v_); - new_Q = modularity(total_edge_weight, resolution); if (new_Q > cur_Q) { - thrust::copy(rmm::exec_policy(stream_)->on(stream_), - next_cluster_v.begin(), - next_cluster_v.end(), - cluster_v_.begin()); + raft::copy(dendrogram_->current_level_begin(), + next_cluster_v.begin(), + next_cluster_v.size(), + handle_.get_stream()); } } - // cache the final clustering locally on each cpu - cache_vertex_properties(cluster_v_, src_cluster_cache_v_, dst_cluster_cache_v_); - - timer_stop(stream_); + timer_stop(handle_.get_stream_view()); return cur_Q; } - void update_by_delta_modularity(weight_t total_edge_weight, - weight_t resolution, - rmm::device_vector &next_cluster_v, - bool up_down) + void compute_cluster_sum_and_subtract(rmm::device_uvector& old_cluster_sum_v, + rmm::device_uvector& cluster_subtract_v) { - rmm::device_vector old_cluster_sum_v(local_num_vertices_); - rmm::device_vector src_old_cluster_sum_cache_v; + auto output_buffer = + cugraph::experimental::allocate_dataframe_buffer>( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream_view()); experimental::copy_v_transform_reduce_out_nbr( handle_, current_graph_view_, - src_cluster_cache_v_.begin(), - dst_cluster_cache_v_.begin(), + 
d_src_cluster_cache_, + d_dst_cluster_cache_, [] __device__(auto src, auto dst, auto wt, auto src_cluster, auto nbr_cluster) { - if ((src != dst) && (src_cluster == nbr_cluster)) { - return wt; - } else - return weight_t{0}; - }, - weight_t{0}, - old_cluster_sum_v.begin()); - - cache_vertex_properties( - old_cluster_sum_v, src_old_cluster_sum_cache_v, empty_cache_weight_v_, true, false); - - detail::src_cluster_equality_comparator_t compare( - src_indices_v_.data().get(), - current_graph_view_.indices(), - dst_cluster_cache_v_.data().get(), - base_dst_vertex_id_, - std::numeric_limits::max()); - detail::src_cluster_hasher_t hasher(src_indices_v_.data().get(), - current_graph_view_.indices(), - dst_cluster_cache_v_.data().get(), - base_dst_vertex_id_); - detail::skip_edge_t skip_edge(src_indices_v_.data().get(), - current_graph_view_.indices()); - - // - // Group edges that lead from same source to same neighboring cluster together - // local_cluster_edge_ids_v will contain edge ids of unique pairs of (src,nbr_cluster). - // If multiple edges exist, one edge id will be chosen (by a parallel race). - // nbr_weights_v will contain the combined weight of all of the edges that connect - // that pair. - // - rmm::device_vector local_cluster_edge_ids_v; - rmm::device_vector nbr_weights_v; - - // - // Perform this combining on the local edges - // - std::tie(local_cluster_edge_ids_v, nbr_weights_v) = combine_local_src_nbr_cluster_weights( - hasher, compare, skip_edge, current_graph_view_.weights(), local_num_edges_); - - // - // In order to compute delta_Q for a given src/nbr_cluster pair, I need the following - // information: - // src - // old_cluster - the cluster that src is currently assigned to - // nbr_cluster - // sum of edges going to new cluster - // vertex weight of the src vertex - // sum of edges going to old cluster - // cluster_weights of old cluster - // cluster_weights of nbr_cluster - // - // Each GPU has locally cached: - // The sum of edges going to the old cluster (computed from - // experimental::copy_v_transform_reduce_out_nbr call above. - // old_cluster - // nbr_cluster - // vertex weight of src vertex - // partial sum of edges going to the new cluster (in nbr_weights) - // - // So the plan is to take the tuple: - // (src, old_cluster, src_vertex_weight, old_cluster_sum, nbr_cluster, nbr_weights) - // and shuffle it around the cluster so that they arrive at the GPU where the pair - // (old_cluster, new_cluster) would be assigned. Then we can aggregate this information - // and compute the delta_Q values. 
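//
// Sketch: the plan laid out in the comment above ships, for each
// (src, neighbor-cluster) pair, exactly the operands of the Louvain modularity
// gain; once they sit on one GPU the gain is a pointwise formula. This
// host-side restatement of the expression (the same one used by the removed
// path here and by the new copy_v_transform_reduce_key_aggregated_out_nbr path
// further below) uses made-up values in main purely for illustration:
//
#include <iostream>

double delta_modularity(double total_edge_weight, double resolution,
                        double new_cluster_sum,  // weight of src's edges into nbr_cluster
                        double old_cluster_sum,  // weight of src's edges into its old cluster
                        double k_k,              // weighted degree of src
                        double a_old,            // total weight of the old cluster
                        double a_new)            // total weight of the neighbor cluster
{
  return 2 * ((new_cluster_sum - old_cluster_sum) / total_edge_weight -
              resolution * (a_new * k_k - a_old * k_k + k_k * k_k) /
                  (total_edge_weight * total_edge_weight));
}

int main()
{
  // moving a degree-2 vertex toward a cluster it shares two edge units with:
  std::cout << delta_modularity(20.0, 1.0, 2.0, 1.0, 2.0, 6.0, 4.0) << '\n';
}
//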
- // - - // - // Define the communication pattern, we're going to send detail - // for edge i to the GPU that is responsible for the vertex - // pair (cluster[src[i]], cluster[dst[i]]) - // - auto communication_schedule = thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - [d_edge_device_view = compute_partition_.edge_device_view(), - d_src_indices = src_indices_v_.data().get(), - d_src_cluster = src_cluster_cache_v_.data().get(), - d_dst_indices = current_graph_view_.indices(), - d_dst_cluster = dst_cluster_cache_v_.data().get(), - base_src_vertex_id = base_src_vertex_id_, - base_dst_vertex_id = base_dst_vertex_id_] __device__(edge_t edge_id) { - return d_edge_device_view(d_src_cluster[d_src_indices[edge_id] - base_src_vertex_id], - d_dst_cluster[d_dst_indices[edge_id] - base_dst_vertex_id]); - }); + weight_t subtract{0}; + weight_t sum{0}; - // FIXME: This should really be a variable_shuffle of a tuple, for time - // reasons I'm just doing 6 independent shuffles. - // - rmm::device_vector ocs_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - detail::lookup_by_vertex_id(src_old_cluster_sum_cache_v.data().get(), - src_indices_v_.data().get(), - base_src_vertex_id_)), - communication_schedule); - - rmm::device_vector src_cluster_v = - variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - detail::lookup_by_vertex_id( - src_cluster_cache_v_.data().get(), src_indices_v_.data().get(), base_src_vertex_id_)), - communication_schedule); - - rmm::device_vector src_vertex_weight_v = - variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - detail::lookup_by_vertex_id(src_vertex_weights_cache_v_.data().get(), - src_indices_v_.data().get(), - base_src_vertex_id_)), - communication_schedule); - - rmm::device_vector src_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(src_indices_v_.begin(), local_cluster_edge_ids_v.begin()), - communication_schedule); + if (src == dst) + subtract = wt; + else if (src_cluster == nbr_cluster) + sum = wt; - rmm::device_vector nbr_cluster_v = - variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - detail::lookup_by_vertex_id( - dst_cluster_cache_v_.data().get(), current_graph_view_.indices(), base_dst_vertex_id_)), - communication_schedule); - - nbr_weights_v = variable_shuffle( - handle_, nbr_weights_v.size(), nbr_weights_v.begin(), communication_schedule); - - // - // At this point, src_v, nbr_cluster_v and nbr_weights_v have been - // shuffled to the correct GPU. We can now compute the final - // value of delta_Q for each neigboring cluster - // - // Again, we'll combine edges that connect the same source to the same - // neighboring cluster and sum their weights. 
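//
// Sketch: combine_local_src_nbr_cluster_weights() (removed below) groups edges
// through a cuco::static_map whose hasher/comparator treat two edge ids as
// equal when they share the same (src, neighbor-cluster) pair, then atomically
// sums the weights of parallel edges into one entry per pair. A sequential
// restatement with std::unordered_map, reusing the Boost hash_combine step the
// device hashers above use (illustrative only, not the cuGraph implementation):
//
#include <cstddef>
#include <cstdint>
#include <functional>
#include <unordered_map>
#include <utility>
#include <vector>

using pair_key_t = std::pair<int32_t, int32_t>;  // (src, nbr_cluster)

struct pair_hash {
  size_t operator()(pair_key_t const& k) const
  {
    size_t h = std::hash<int32_t>{}(k.first);
    h ^= std::hash<int32_t>{}(k.second) + 0x9e3779b9 + (h << 6) + (h >> 2);
    return h;
  }
};

std::unordered_map<pair_key_t, double, pair_hash> combine_src_nbr_cluster_weights(
  std::vector<int32_t> const& src,
  std::vector<int32_t> const& nbr_cluster,
  std::vector<double> const& wt)
{
  std::unordered_map<pair_key_t, double, pair_hash> sums;
  for (size_t e = 0; e < src.size(); ++e)
    sums[{src[e], nbr_cluster[e]}] += wt[e];  // parallel edges collapse here
  return sums;
}
//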
- // - detail::src_dst_equality_comparator_t compare2( - src_v, nbr_cluster_v, std::numeric_limits::max()); - detail::src_dst_hasher_t hasher2(src_v, nbr_cluster_v); - - auto skip_edge2 = [] __device__(auto) { return false; }; - - std::tie(local_cluster_edge_ids_v, nbr_weights_v) = combine_local_src_nbr_cluster_weights( - hasher2, compare2, skip_edge2, nbr_weights_v.data().get(), src_v.size()); - - // - // Now local_cluster_edge_ids_v contains the edge ids of the src id/dest - // cluster id pairs, and nbr_weights_v contains the weight of edges - // going to that cluster id - // - // Now we can compute (locally) each delta_Q value - // - auto iter = thrust::make_zip_iterator( - thrust::make_tuple(local_cluster_edge_ids_v.begin(), nbr_weights_v.begin())); - - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - iter, - iter + local_cluster_edge_ids_v.size(), - nbr_weights_v.begin(), - [total_edge_weight, - resolution, - d_src = src_v.data().get(), - d_src_cluster = src_cluster_v.data().get(), - d_nbr_cluster = nbr_cluster_v.data().get(), - d_src_vertex_weights = src_vertex_weight_v.data().get(), - d_src_cluster_weights = src_cluster_weights_cache_v_.data().get(), - d_dst_cluster_weights = dst_cluster_weights_cache_v_.data().get(), - d_ocs = ocs_v.data().get(), - base_src_vertex_id = base_src_vertex_id_, - base_dst_vertex_id = base_dst_vertex_id_] __device__(auto tuple) { - edge_t edge_id = thrust::get<0>(tuple); - vertex_t nbr_cluster = d_nbr_cluster[edge_id]; - weight_t new_cluster_sum = thrust::get<1>(tuple); - vertex_t old_cluster = d_src_cluster[edge_id]; - weight_t k_k = d_src_vertex_weights[edge_id]; - weight_t old_cluster_sum = d_ocs[edge_id]; - - weight_t a_old = d_src_cluster_weights[old_cluster - base_src_vertex_id]; - weight_t a_new = d_dst_cluster_weights[nbr_cluster - base_dst_vertex_id]; - - return 2 * (((new_cluster_sum - old_cluster_sum) / total_edge_weight) - - resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / - (total_edge_weight * total_edge_weight)); - }); - - // - // Pick the largest delta_Q value for each vertex on this gpu. 
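//
// Sketch: "pick the largest delta_Q per vertex" is implemented below as a
// sort on (src ascending, delta_Q descending) followed by keeping the first
// row of each src run, i.e. a sort-based argmax-by-key. The same selection,
// stated sequentially (names are illustrative):
//
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct move_t {
  int32_t src;
  int32_t nbr_cluster;
  double delta_q;
};

std::vector<move_t> best_move_per_vertex(std::vector<move_t> moves)
{
  std::sort(moves.begin(), moves.end(), [](move_t const& a, move_t const& b) {
    if (a.src != b.src) return a.src < b.src;
    if (a.delta_q != b.delta_q) return a.delta_q > b.delta_q;  // largest gain first
    return a.nbr_cluster < b.nbr_cluster;                      // deterministic tie break
  });
  std::vector<move_t> best;
  for (size_t i = 0; i < moves.size(); ++i)
    if (i == 0 || moves[i].src != moves[i - 1].src) best.push_back(moves[i]);
  return best;
}
//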
- // Then we will shuffle back to the gpu by vertex id - // - rmm::device_vector final_src_v(local_cluster_edge_ids_v.size()); - rmm::device_vector final_nbr_cluster_v(local_cluster_edge_ids_v.size()); - rmm::device_vector final_nbr_weights_v(local_cluster_edge_ids_v.size()); - - auto final_input_iter = thrust::make_zip_iterator(thrust::make_tuple( - thrust::make_permutation_iterator(src_v.begin(), local_cluster_edge_ids_v.begin()), - thrust::make_permutation_iterator(nbr_cluster_v.begin(), local_cluster_edge_ids_v.begin()), - nbr_weights_v.begin())); - - auto final_output_iter = thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.begin(), final_nbr_cluster_v.begin(), final_nbr_weights_v.begin())); - - auto final_output_pos = - thrust::copy_if(rmm::exec_policy(stream_)->on(stream_), - final_input_iter, - final_input_iter + local_cluster_edge_ids_v.size(), - final_output_iter, - [] __device__(auto p) { return (thrust::get<2>(p) > weight_t{0}); }); - - final_src_v.resize(thrust::distance(final_output_iter, final_output_pos)); - final_nbr_cluster_v.resize(thrust::distance(final_output_iter, final_output_pos)); - final_nbr_weights_v.resize(thrust::distance(final_output_iter, final_output_pos)); - - // - // Sort the results, pick the largest version - // - thrust::sort(rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.begin(), final_nbr_weights_v.begin(), final_nbr_cluster_v.begin())), - thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.end(), final_nbr_weights_v.end(), final_nbr_cluster_v.begin())), - [] __device__(auto left, auto right) { - if (thrust::get<0>(left) < thrust::get<0>(right)) return true; - if (thrust::get<0>(left) > thrust::get<0>(right)) return false; - if (thrust::get<1>(left) > thrust::get<1>(right)) return true; - if (thrust::get<1>(left) < thrust::get<1>(right)) return false; - return (thrust::get<2>(left) < thrust::get<2>(right)); - }); - - // - // Now that we're sorted the first entry for each src value is the largest. 
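//
// Sketch: the transform that follows flags run boundaries in the sorted src
// array by comparing each element with its predecessor, writes a sentinel
// everywhere else, and then compacts the survivors
// (remove_elements_from_vector above is just a copy_if wrapper). The same
// flag-and-compact idiom in brief:
//
#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>

std::vector<size_t> run_starts(std::vector<int32_t> const& sorted_src)
{
  constexpr size_t sentinel = std::numeric_limits<size_t>::max();
  std::vector<size_t> flagged(sorted_src.size());
  for (size_t i = 0; i < sorted_src.size(); ++i)
    flagged[i] = (i == 0 || sorted_src[i - 1] != sorted_src[i]) ? i : sentinel;

  std::vector<size_t> starts;  // the compaction step (the copy_if)
  for (size_t v : flagged)
    if (v != sentinel) starts.push_back(v);
  return starts;
}
//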
- // - local_cluster_edge_ids_v.resize(final_src_v.size()); - - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(final_src_v.size()), - local_cluster_edge_ids_v.begin(), - [sentinel = std::numeric_limits::max(), - d_src = final_src_v.data().get()] __device__(edge_t edge_id) { - if (edge_id == 0) { return edge_id; } - - if (d_src[edge_id - 1] != d_src[edge_id]) { return edge_id; } - - return sentinel; - }); - - local_cluster_edge_ids_v = detail::remove_elements_from_vector( - local_cluster_edge_ids_v, - [sentinel = std::numeric_limits::max()] __device__(auto edge_id) { - return (edge_id != sentinel); + return thrust::make_tuple(subtract, sum); }, - stream_); - - final_nbr_cluster_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(final_nbr_cluster_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_src_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - final_nbr_weights_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(final_nbr_weights_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_src_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - final_src_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(final_src_v.begin(), local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_src_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - // - // At this point... - // final_src_v contains the source indices - // final_nbr_cluster_v contains the neighboring clusters - // final_nbr_weights_v contains delta_Q for moving src to the neighboring - // - // They have been shuffled to the gpus responsible for their source vertex - // - // FIXME: Think about how this should work. - // I think Leiden is broken. I don't think that the code we have - // actually does anything. For now I'm going to ignore Leiden in - // MNMG, we can reconsider this later. - // - // If we ignore Leiden, I'd like to think about whether the reduction - // should occur now... - // - - // - // Sort the results, pick the largest version - // - thrust::sort(rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.begin(), final_nbr_weights_v.begin(), final_nbr_cluster_v.begin())), - thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.end(), final_nbr_weights_v.end(), final_nbr_cluster_v.begin())), - [] __device__(auto left, auto right) { - if (thrust::get<0>(left) < thrust::get<0>(right)) return true; - if (thrust::get<0>(left) > thrust::get<0>(right)) return false; - if (thrust::get<1>(left) > thrust::get<1>(right)) return true; - if (thrust::get<1>(left) < thrust::get<1>(right)) return false; - return (thrust::get<2>(left) < thrust::get<2>(right)); - }); - - // - // Now that we're sorted (ascending), the last entry for each src value is the largest. 
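//
// Sketch: the up_down flag (flipped each sweep via `up_down = !up_down` in
// update_clustering earlier in this diff) only permits moves toward
// higher-numbered clusters on "up" sweeps and lower-numbered ones on "down"
// sweeps. Because all vertices move in parallel, this one-directional rule
// keeps two neighboring vertices from endlessly swapping clusters. The
// decision rule, in isolation (it mirrors both the removed lambda below and
// the new transform near the end of this diff):
//
#include <cstdint>

int32_t apply_move(int32_t old_cluster, int32_t new_cluster, double delta_q, bool up_down)
{
  if (delta_q <= 0.0) return old_cluster;                          // no gain: stay put
  if ((new_cluster > old_cluster) != up_down) return old_cluster;  // wrong direction this sweep
  return new_cluster;
}
//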
- // - local_cluster_edge_ids_v.resize(final_src_v.size()); - - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(final_src_v.size()), - local_cluster_edge_ids_v.begin(), - [sentinel = std::numeric_limits::max(), - d_src = final_src_v.data().get()] __device__(edge_t edge_id) { - if (edge_id == 0) { return edge_id; } - - if (d_src[edge_id - 1] != d_src[edge_id]) { return edge_id; } - - return sentinel; - }); - - local_cluster_edge_ids_v = detail::remove_elements_from_vector( - local_cluster_edge_ids_v, - [sentinel = std::numeric_limits::max()] __device__(auto edge_id) { - return (edge_id != sentinel); - }, - stream_); - - rmm::device_vector cluster_increase_v(final_src_v.size()); - rmm::device_vector cluster_decrease_v(final_src_v.size()); - rmm::device_vector old_cluster_v(final_src_v.size()); - - // - // Then we can, on each gpu, do a local assignment for all of the - // vertices assigned to that gpu using the up_down logic - // - local_cluster_edge_ids_v = detail::remove_elements_from_vector( - local_cluster_edge_ids_v, - local_cluster_edge_ids_v.begin(), - local_cluster_edge_ids_v.end(), - [d_final_src = final_src_v.data().get(), - d_final_nbr_cluster = final_nbr_cluster_v.data().get(), - d_final_nbr_weights = final_nbr_weights_v.data().get(), - d_cluster_increase = cluster_increase_v.data().get(), - d_cluster_decrease = cluster_decrease_v.data().get(), - d_vertex_weights = src_vertex_weights_cache_v_.data().get(), - d_next_cluster = next_cluster_v.data().get(), - d_old_cluster = old_cluster_v.data().get(), - base_vertex_id = base_vertex_id_, - base_src_vertex_id = base_src_vertex_id_, - up_down] __device__(edge_t idx) { - vertex_t src = d_final_src[idx]; - vertex_t new_cluster = d_final_nbr_cluster[idx]; - vertex_t old_cluster = d_next_cluster[src - base_vertex_id]; - weight_t src_weight = d_vertex_weights[src - base_src_vertex_id]; - - if (d_final_nbr_weights[idx] <= weight_t{0}) return false; - if (new_cluster == old_cluster) return false; - if ((new_cluster > old_cluster) != up_down) return false; - - d_next_cluster[src - base_vertex_id] = new_cluster; - d_cluster_increase[idx] = src_weight; - d_cluster_decrease[idx] = src_weight; - d_old_cluster[idx] = old_cluster; - return true; - }, - stream_); - - cluster_increase_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(cluster_increase_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_nbr_cluster_v.begin(), - local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - final_nbr_cluster_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(final_nbr_cluster_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_nbr_cluster_v.begin(), - local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - cluster_decrease_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(cluster_decrease_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(old_cluster_v.begin(), 
local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - old_cluster_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(old_cluster_v.begin(), local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(old_cluster_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator( - thrust::make_tuple(final_nbr_cluster_v.begin(), cluster_increase_v.begin())), - thrust::make_zip_iterator( - thrust::make_tuple(final_nbr_cluster_v.end(), cluster_increase_v.end())), - [d_cluster_weights = cluster_weights_v_.data().get(), - base_vertex_id = base_vertex_id_] __device__(auto p) { - vertex_t cluster_id = thrust::get<0>(p); - weight_t weight = thrust::get<1>(p); - - atomicAdd(d_cluster_weights + cluster_id - base_vertex_id, weight); - }); - - thrust::for_each( - rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator( - thrust::make_tuple(old_cluster_v.begin(), cluster_decrease_v.begin())), - thrust::make_zip_iterator(thrust::make_tuple(old_cluster_v.end(), cluster_decrease_v.end())), - [d_cluster_weights = cluster_weights_v_.data().get(), - base_vertex_id = base_vertex_id_] __device__(auto p) { - vertex_t cluster_id = thrust::get<0>(p); - weight_t weight = thrust::get<1>(p); - - atomicAdd(d_cluster_weights + cluster_id - base_vertex_id, -weight); - }); - - cache_vertex_properties( - cluster_weights_v_, src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); - } - - template - std::pair, rmm::device_vector> - combine_local_src_nbr_cluster_weights(hash_t hasher, - compare_t compare, - skip_edge_t skip_edge, - weight_t const *d_weights, - count_t num_weights) - { - rmm::device_vector relevant_edges_v; - rmm::device_vector relevant_edge_weights_v; - - if (num_weights > 0) { - std::size_t capacity{static_cast(num_weights / 0.7)}; - - cuco::static_map hash_map( - capacity, std::numeric_limits::max(), count_t{0}); - detail::create_cuco_pair_t create_cuco_pair; - - CUDA_TRY(cudaStreamSynchronize(stream_)); - - hash_map.insert(thrust::make_transform_iterator(thrust::make_counting_iterator(0), - create_cuco_pair), - thrust::make_transform_iterator( - thrust::make_counting_iterator(num_weights), create_cuco_pair), - hasher, - compare); - - CUDA_TRY(cudaStreamSynchronize(stream_)); - - relevant_edges_v.resize(num_weights); - - relevant_edges_v = detail::remove_elements_from_vector( - relevant_edges_v, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_weights), - [d_hash_map = hash_map.get_device_view(), hasher, compare] __device__(count_t idx) { - auto pos = d_hash_map.find(idx, hasher, compare); - return (pos->first == idx); - }, - stream_); - - thrust::for_each_n( - rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - relevant_edges_v.size(), - [d_hash_map = hash_map.get_device_view(), - hasher, - compare, - d_relevant_edges = relevant_edges_v.data().get()] __device__(count_t idx) mutable { - count_t edge_id = d_relevant_edges[idx]; - auto pos = d_hash_map.find(edge_id, hasher, compare); - pos->second.store(idx); - }); - - relevant_edge_weights_v.resize(relevant_edges_v.size()); - thrust::fill(rmm::exec_policy(stream_)->on(stream_), 
- relevant_edge_weights_v.begin(), - relevant_edge_weights_v.end(), - weight_t{0}); - - thrust::for_each_n( - rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - num_weights, - [d_hash_map = hash_map.get_device_view(), - hasher, - compare, - skip_edge, - d_relevant_edge_weights = relevant_edge_weights_v.data().get(), - d_weights] __device__(count_t idx) { - if (!skip_edge(idx)) { - auto pos = d_hash_map.find(idx, hasher, compare); - if (pos != d_hash_map.end()) { - atomicAdd(d_relevant_edge_weights + pos->second.load(cuda::std::memory_order_relaxed), - d_weights[idx]); - } - } - }); - } - - return std::make_pair(relevant_edges_v, relevant_edge_weights_v); + thrust::make_tuple(weight_t{0}, weight_t{0}), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer)); + + thrust::transform( + rmm::exec_policy(handle_.get_stream_view()), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer) + + current_graph_view_.get_number_of_local_vertices(), + old_cluster_sum_v.begin(), + [] __device__(auto p) { return thrust::get<1>(p); }); + + thrust::transform( + rmm::exec_policy(handle_.get_stream_view()), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer) + + current_graph_view_.get_number_of_local_vertices(), + cluster_subtract_v.begin(), + [] __device__(auto p) { return thrust::get<0>(p); }); } - void shrink_graph(vertex_t *d_cluster_vec) - { - timer_start("shrinking graph"); - - std::size_t capacity{static_cast((local_num_rows_ + local_num_cols_) / 0.7)}; - - cuco::static_map hash_map( - capacity, std::numeric_limits::max(), std::numeric_limits::max()); - - // renumber the clusters to the range 0..(num_clusters-1) - vertex_t num_clusters = renumber_clusters(hash_map); - - renumber_result(hash_map, d_cluster_vec, num_clusters); - - // shrink our graph to represent the graph of supervertices - generate_supervertices_graph(hash_map, num_clusters); - - // assign each new vertex to its own cluster - // MNMG: This can be done locally with no communication required - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), - cluster_v_.begin(), - cluster_v_.end(), - base_vertex_id_); - - timer_stop(stream_); - } - - vertex_t renumber_clusters(cuco::static_map &hash_map) + void update_by_delta_modularity(weight_t total_edge_weight, + weight_t resolution, + rmm::device_uvector& next_cluster_v, + bool up_down) { - rmm::device_vector cluster_inverse_v(local_num_vertices_, vertex_t{0}); - - // - // FIXME: Faster to iterate from graph_.get_vertex_partition_first() - // to graph_.get_vertex_partition_last()? That would potentially - // result in adding a cluster that isn't used on this GPU, - // although I don't think it would break the result in any way. - // - // This would also eliminate this use of src_indices_v_. 
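//
// Sketch: renumber_clusters() (removed here) compacts the set of surviving
// cluster ids into the dense range 0..num_clusters-1 so the contracted graph
// can use them as vertex ids; in the multi-GPU path each GPU numbers its share
// starting at a base offset derived from an allgather of per-GPU counts. The
// single-process equivalent of that relabeling (illustrative names):
//
#include <cstdint>
#include <unordered_map>
#include <vector>

std::unordered_map<int32_t, int32_t> renumber(std::vector<int32_t> const& used_clusters,
                                              int32_t base_address /* 0 on a single GPU */)
{
  std::unordered_map<int32_t, int32_t> old_to_new;
  int32_t next = base_address;
  for (int32_t c : used_clusters)  // one entry per distinct surviving cluster
    if (old_to_new.emplace(c, next).second) ++next;
  return old_to_new;
}
//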
- // - auto it_src = thrust::make_transform_iterator( - src_indices_v_.begin(), - [base_src_vertex_id = base_src_vertex_id_, - d_src_cluster_cache = src_cluster_cache_v_.data().get()] __device__(auto idx) { - return detail::create_cuco_pair_t()( - d_src_cluster_cache[idx - base_src_vertex_id]); - }); + rmm::device_uvector old_cluster_sum_v( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); + rmm::device_uvector cluster_subtract_v( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); + rmm::device_uvector src_cluster_weights_v(next_cluster_v.size(), + handle_.get_stream()); - auto it_dst = thrust::make_transform_iterator( - current_graph_view_.indices(), - [base_dst_vertex_id = base_dst_vertex_id_, - d_dst_cluster_cache = dst_cluster_cache_v_.data().get()] __device__(auto idx) { - return detail::create_cuco_pair_t()( - d_dst_cluster_cache[idx - base_dst_vertex_id]); - }); + compute_cluster_sum_and_subtract(old_cluster_sum_v, cluster_subtract_v); - hash_map.insert(it_src, it_src + local_num_edges_); - hash_map.insert(it_dst, it_dst + local_num_edges_); + auto output_buffer = + cugraph::experimental::allocate_dataframe_buffer>( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); - // Now I need to get the keys into an array and shuffle them - rmm::device_vector used_cluster_ids_v(hash_map.get_size()); + vertex_t* map_key_first; + vertex_t* map_key_last; + weight_t* map_value_first; - auto transform_iter = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [d_hash_map = hash_map.get_device_view()] __device__(std::size_t idx) { - return d_hash_map.begin_slot()[idx].first.load(); - }); - - used_cluster_ids_v = detail::remove_elements_from_vector( - used_cluster_ids_v, - transform_iter, - transform_iter + hash_map.get_capacity(), - [vmax = std::numeric_limits::max()] __device__(vertex_t cluster) { - return cluster != vmax; - }, - stream_); - - auto partition_cluster_ids_iter = thrust::make_transform_iterator( - used_cluster_ids_v.begin(), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - }); - - rmm::device_vector original_gpus_v; - rmm::device_vector my_cluster_ids_v = - variable_shuffle( - handle_, used_cluster_ids_v.size(), used_cluster_ids_v.begin(), partition_cluster_ids_iter); + if (graph_t::is_multi_gpu) { + cugraph::experimental::detail::compute_gpu_id_from_vertex_t vertex_to_gpu_id_op{ + handle_.get_comms().get_size()}; + + src_cluster_weights_v = cugraph::experimental::collect_values_for_keys( + handle_.get_comms(), + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + cluster_weights_v_.data(), + d_src_cluster_cache_, + d_src_cluster_cache_ + src_cluster_cache_v_.size(), + vertex_to_gpu_id_op, + handle_.get_stream()); + + map_key_first = cluster_keys_v_.begin(); + map_key_last = cluster_keys_v_.end(); + map_value_first = cluster_weights_v_.begin(); + } else { + thrust::sort_by_key(rmm::exec_policy(handle_.get_stream_view()), + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + cluster_weights_v_.begin()); + + thrust::transform(rmm::exec_policy(handle_.get_stream_view()), + next_cluster_v.begin(), + next_cluster_v.end(), + src_cluster_weights_v.begin(), + [d_cluster_weights = cluster_weights_v_.data(), + d_cluster_keys = cluster_keys_v_.data(), + num_clusters = cluster_keys_v_.size()] __device__(vertex_t cluster) { + auto pos = thrust::lower_bound( + thrust::seq, d_cluster_keys, d_cluster_keys + num_clusters, 
cluster); + return d_cluster_weights[pos - d_cluster_keys]; + }); - if (graph_view_t::is_multi_gpu) { - original_gpus_v = variable_shuffle( - handle_, - used_cluster_ids_v.size(), - thrust::make_constant_iterator(rank_), - partition_cluster_ids_iter); + map_key_first = d_src_cluster_cache_; + map_key_last = d_src_cluster_cache_ + src_cluster_weights_v.size(); + map_value_first = src_cluster_weights_v.begin(); } - // - // Now my_cluster_ids contains the cluster ids that this gpu is - // responsible for. I'm going to set cluster_inverse_v to one - // for each cluster in this list. - // - thrust::for_each( - rmm::exec_policy(stream_)->on(stream_), - my_cluster_ids_v.begin(), - my_cluster_ids_v.end(), - [base_vertex_id = base_vertex_id_, - d_cluster_inverse = cluster_inverse_v.data().get()] __device__(vertex_t cluster) { - d_cluster_inverse[cluster - base_vertex_id] = 1; - }); + rmm::device_uvector src_old_cluster_sum_v( + current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), handle_.get_stream()); + rmm::device_uvector src_cluster_subtract_v( + current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), handle_.get_stream()); + copy_to_adj_matrix_row( + handle_, current_graph_view_, old_cluster_sum_v.begin(), src_old_cluster_sum_v.begin()); + copy_to_adj_matrix_row( + handle_, current_graph_view_, cluster_subtract_v.begin(), src_cluster_subtract_v.begin()); - rmm::device_vector my_cluster_ids_deduped_v = detail::remove_elements_from_vector( - my_cluster_ids_v, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(cluster_inverse_v.size()), - [d_cluster_inverse = cluster_inverse_v.data().get()] __device__(auto idx) { - return d_cluster_inverse[idx] == 1; + copy_v_transform_reduce_key_aggregated_out_nbr( + handle_, + current_graph_view_, + thrust::make_zip_iterator(thrust::make_tuple(src_old_cluster_sum_v.begin(), + d_src_vertex_weights_cache_, + src_cluster_subtract_v.begin(), + d_src_cluster_cache_, + src_cluster_weights_v.begin())), + + d_dst_cluster_cache_, + map_key_first, + map_key_last, + map_value_first, + [total_edge_weight, resolution] __device__( + auto src, auto neighbor_cluster, auto new_cluster_sum, auto src_info, auto a_new) { + auto old_cluster_sum = thrust::get<0>(src_info); + auto k_k = thrust::get<1>(src_info); + auto cluster_subtract = thrust::get<2>(src_info); + auto src_cluster = thrust::get<3>(src_info); + auto a_old = thrust::get<4>(src_info); + + if (src_cluster == neighbor_cluster) new_cluster_sum -= cluster_subtract; + + weight_t delta_modularity = 2 * (((new_cluster_sum - old_cluster_sum) / total_edge_weight) - + resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / + (total_edge_weight * total_edge_weight)); + + return thrust::make_tuple(neighbor_cluster, delta_modularity); }, - stream_); - - // - // Need to gather everything to be able to compute base addresses - // - vertex_t base_address{0}; + [] __device__(auto p1, auto p2) { + auto id1 = thrust::get<0>(p1); + auto id2 = thrust::get<0>(p2); + auto wt1 = thrust::get<1>(p1); + auto wt2 = thrust::get<1>(p2); - if (graph_view_t::is_multi_gpu) { - int num_gpus{1}; - rmm::device_vector sizes_v(num_gpus + 1, my_cluster_ids_deduped_v.size()); - - handle_.get_comms().allgather( - sizes_v.data().get() + num_gpus, sizes_v.data().get(), num_gpus, stream_); - - base_address = thrust::reduce(rmm::exec_policy(stream_)->on(stream_), - sizes_v.begin(), - sizes_v.begin() + rank_, - vertex_t{0}); - } - - // - // Now let's update cluster_inverse_v to contain - // the mapping of old cluster 
id to new vertex id - // - thrust::fill( - cluster_inverse_v.begin(), cluster_inverse_v.end(), std::numeric_limits::max()); - - thrust::for_each_n(rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - my_cluster_ids_deduped_v.size(), - [base_address, - d_my_cluster_ids_deduped = my_cluster_ids_deduped_v.data().get(), - d_cluster_inverse = cluster_inverse_v.data().get()] __device__(auto idx) { - d_cluster_inverse[d_my_cluster_ids_deduped[idx]] = idx + base_address; - }); - - // - // Now I need to shuffle back to original gpus the - // subset of my mapping that is required - // - rmm::device_vector new_vertex_ids_v = - variable_shuffle( - handle_, - my_cluster_ids_v.size(), - thrust::make_transform_iterator(my_cluster_ids_v.begin(), - [d_cluster_inverse = cluster_inverse_v.data().get(), - base_vertex_id = base_vertex_id_] __device__(auto v) { - return d_cluster_inverse[v - base_vertex_id]; - }), - original_gpus_v.begin()); - - if (graph_view_t::is_multi_gpu) { - my_cluster_ids_v = variable_shuffle( - handle_, my_cluster_ids_v.size(), my_cluster_ids_v.begin(), original_gpus_v.begin()); - } - - // - // Now update the hash map with the new vertex id - // - thrust::for_each_n(rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator( - thrust::make_tuple(my_cluster_ids_v.begin(), new_vertex_ids_v.begin())), - my_cluster_ids_v.size(), - [d_hash_map = hash_map.get_device_view()] __device__(auto p) mutable { - auto pos = d_hash_map.find(thrust::get<0>(p)); - pos->second.store(thrust::get<1>(p)); - }); - - // - // At this point we have a renumbered COO that is - // improperly distributed around the cluster, which - // will be fixed by generate_supervertices_graph - // - if (graph_t::is_multi_gpu) { - return host_scalar_allreduce( - handle_.get_comms(), static_cast(my_cluster_ids_deduped_v.size()), stream_); - } else { - return static_cast(my_cluster_ids_deduped_v.size()); - } - } - - void renumber_result(cuco::static_map const &hash_map, - vertex_t *d_cluster_vec, - vertex_t num_clusters) - { - if (graph_view_t::is_multi_gpu) { - // - // FIXME: Perhaps there's a general purpose function hidden here... - // Given a set of vertex_t values, and a distributed set of - // vertex properties, go to the proper node and retrieve - // the vertex properties and return them to this gpu. - // - std::size_t capacity{static_cast((local_num_vertices_) / 0.7)}; - cuco::static_map result_hash_map( - capacity, std::numeric_limits::max(), std::numeric_limits::max()); - - auto cluster_iter = thrust::make_transform_iterator(d_cluster_vec, [] __device__(vertex_t c) { - return detail::create_cuco_pair_t()(c); + return (wt1 < wt2) ? p2 : ((wt1 > wt2) ? p1 : ((id1 < id2) ? p1 : p2)); + }, + thrust::make_tuple(vertex_t{-1}, weight_t{0}), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer)); + + thrust::transform( + rmm::exec_policy(handle_.get_stream_view()), + next_cluster_v.begin(), + next_cluster_v.end(), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer), + next_cluster_v.begin(), + [up_down] __device__(vertex_t old_cluster, auto p) { + vertex_t new_cluster = thrust::get<0>(p); + weight_t delta_modularity = thrust::get<1>(p); + + return (delta_modularity > weight_t{0}) + ? (((new_cluster > old_cluster) != up_down) ? 
old_cluster : new_cluster) + : old_cluster; }); - result_hash_map.insert(cluster_iter, cluster_iter + local_num_vertices_); - - rmm::device_vector used_cluster_ids_v(result_hash_map.get_size()); - - auto transform_iter = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [d_result_hash_map = result_hash_map.get_device_view()] __device__(std::size_t idx) { - return d_result_hash_map.begin_slot()[idx].first.load(); - }); - - used_cluster_ids_v = detail::remove_elements_from_vector( - used_cluster_ids_v, - transform_iter, - transform_iter + result_hash_map.get_capacity(), - [vmax = std::numeric_limits::max()] __device__(vertex_t cluster) { - return cluster != vmax; - }, - stream_); - - auto partition_cluster_ids_iter = thrust::make_transform_iterator( - used_cluster_ids_v.begin(), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - }); - - rmm::device_vector old_cluster_ids_v = - variable_shuffle(handle_, - used_cluster_ids_v.size(), - used_cluster_ids_v.begin(), - partition_cluster_ids_iter); - - rmm::device_vector original_gpus_v = - variable_shuffle( - handle_, - used_cluster_ids_v.size(), - thrust::make_constant_iterator(rank_), - partition_cluster_ids_iter); - - // Now each GPU has old cluster ids, let's compute new cluster ids - rmm::device_vector new_cluster_ids_v(old_cluster_ids_v.size()); - - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - old_cluster_ids_v.begin(), - old_cluster_ids_v.end(), - new_cluster_ids_v.begin(), - [base_vertex_id = base_vertex_id_, - d_cluster = cluster_v_.data().get(), - d_hash_map = hash_map.get_device_view()] __device__(vertex_t cluster_id) { - vertex_t c = d_cluster[cluster_id - base_vertex_id]; - auto pos = d_hash_map.find(c); - return pos->second.load(); - }); - - // Shuffle everything back - old_cluster_ids_v = variable_shuffle( - handle_, old_cluster_ids_v.size(), old_cluster_ids_v.begin(), original_gpus_v.begin()); - new_cluster_ids_v = variable_shuffle( - handle_, new_cluster_ids_v.size(), new_cluster_ids_v.begin(), original_gpus_v.begin()); - - // Update result_hash_map - thrust::for_each_n( - rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator( - thrust::make_tuple(old_cluster_ids_v.begin(), new_cluster_ids_v.begin())), - old_cluster_ids_v.size(), - [d_result_hash_map = result_hash_map.get_device_view()] __device__(auto pair) mutable { - auto pos = d_result_hash_map.find(thrust::get<0>(pair)); - pos->second.store(thrust::get<1>(pair)); - }); - - thrust::transform( - rmm::exec_policy(stream_)->on(stream_), - d_cluster_vec, - d_cluster_vec + number_of_vertices_, - d_cluster_vec, - [d_result_hash_map = result_hash_map.get_device_view()] __device__(vertex_t c) { - auto pos = d_result_hash_map.find(c); - return pos->second.load(); - }); + d_src_cluster_cache_ = cache_src_vertex_properties(next_cluster_v, src_cluster_cache_v_); + d_dst_cluster_cache_ = cache_dst_vertex_properties(next_cluster_v, dst_cluster_cache_v_); - } else { - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - d_cluster_vec, - d_cluster_vec + number_of_vertices_, - d_cluster_vec, - [d_hash_map = hash_map.get_device_view(), - d_dst_cluster = dst_cluster_cache_v_.data()] __device__(vertex_t v) { - vertex_t c = d_dst_cluster[v]; - auto pos = d_hash_map.find(c); - return pos->second.load(); - }); - } + std::tie(cluster_keys_v_, cluster_weights_v_) = + cugraph::experimental::transform_reduce_by_adj_matrix_row_key_e( + handle_, + current_graph_view_, + 
thrust::make_constant_iterator(0), + thrust::make_constant_iterator(0), + d_src_cluster_cache_, + [] __device__(auto src, auto dst, auto wt, auto x, auto y) { return wt; }, + weight_t{0}); } - void generate_supervertices_graph(cuco::static_map const &hash_map, - vertex_t num_clusters) + void shrink_graph() { - rmm::device_vector new_src_v(local_num_edges_); - rmm::device_vector new_dst_v(local_num_edges_); - rmm::device_vector new_weight_v(current_graph_view_.weights(), - current_graph_view_.weights() + local_num_edges_); - - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - src_indices_v_.begin(), - src_indices_v_.end(), - new_src_v.begin(), - [base_src_vertex_id = base_src_vertex_id_, - d_src_cluster = src_cluster_cache_v_.data().get(), - d_hash_map = hash_map.get_device_view()] __device__(vertex_t v) { - vertex_t c = d_src_cluster[v - base_src_vertex_id]; - auto pos = d_hash_map.find(c); - return pos->second.load(); - }); - - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - current_graph_view_.indices(), - current_graph_view_.indices() + local_num_edges_, - new_dst_v.begin(), - [base_dst_vertex_id = base_dst_vertex_id_, - d_dst_cluster = dst_cluster_cache_v_.data().get(), - d_hash_map = hash_map.get_device_view()] __device__(vertex_t v) { - vertex_t c = d_dst_cluster[v - base_dst_vertex_id]; - auto pos = d_hash_map.find(c); - return pos->second.load(); - }); - - // Combine common edges on local gpu - std::tie(new_src_v, new_dst_v, new_weight_v) = - combine_local_edges(new_src_v, new_dst_v, new_weight_v); - - if (graph_view_t::is_multi_gpu) { - // - // Shuffle the data to the proper GPU - // FIXME: This needs some performance exploration. It is - // possible (likely?) that the shrunken graph is - // more dense than the original graph. Perhaps that - // changes the dynamic of partitioning efficiently. - // - // For now, we're going to keep the partitioning the same, - // but because we've renumbered to lower numbers, fewer - // partitions will actually have data. - // - rmm::device_vector partition_v(new_src_v.size()); - - thrust::transform( - rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator(thrust::make_tuple(new_src_v.begin(), new_dst_v.begin())), - thrust::make_zip_iterator(thrust::make_tuple(new_src_v.end(), new_dst_v.end())), - partition_v.begin(), - [d_edge_device_view = compute_partition_.edge_device_view()] __device__( - thrust::tuple tuple) { - return d_edge_device_view(thrust::get<0>(tuple), thrust::get<1>(tuple)); - }); - - new_src_v = variable_shuffle( - handle_, partition_v.size(), new_src_v.begin(), partition_v.begin()); - - new_dst_v = variable_shuffle( - handle_, partition_v.size(), new_dst_v.begin(), partition_v.begin()); + timer_start("shrinking graph"); - new_weight_v = variable_shuffle( - handle_, partition_v.size(), new_weight_v.begin(), partition_v.begin()); + rmm::device_uvector numbering_map(0, handle_.get_stream()); - // - // Now everything is on the correct node, again combine like edges - // - std::tie(new_src_v, new_dst_v, new_weight_v) = - combine_local_edges(new_src_v, new_dst_v, new_weight_v); - } - - // - // Now I have a COO of the new graph, distributed according to the - // original clustering (eventually this likely fits on one GPU and - // everything else is empty). 
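The delta-modularity lambda passed to copy_v_transform_reduce_key_aggregated_out_nbr earlier in this hunk scores every candidate cluster move. The following is a minimal host-side sketch of that scoring and is not part of this changeset; plain doubles stand in for weight_t, and the parameter names are hypothetical simplifications of the per-vertex quantities the device lambda reads.

// Illustrative sketch only (not part of the diff): the Louvain gain formula from the
// device lambda above, evaluated for one vertex considering one neighboring cluster.
#include <iostream>

// k_k: the vertex's weighted degree; new/old_cluster_sum: the vertex's edge weight into
// the candidate / current cluster; a_new/a_old: aggregate weight of those clusters.
double delta_modularity(double new_cluster_sum, double old_cluster_sum,
                        double k_k, double a_new, double a_old,
                        double total_edge_weight, double resolution)
{
  return 2 * (((new_cluster_sum - old_cluster_sum) / total_edge_weight) -
              resolution * (a_new * k_k - a_old * k_k + k_k * k_k) /
                (total_edge_weight * total_edge_weight));
}

int main()
{
  // A vertex of weighted degree 3 with 2.0 of its edge weight pointing into the
  // candidate cluster and 1.0 into its current one; the move is accepted only if
  // the returned gain is positive (the up_down tie-break then picks a direction).
  std::cout << delta_modularity(2.0, 1.0, 3.0, 10.0, 12.0, 50.0, 1.0) << '\n';
  return 0;
}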
- // - current_graph_ = - detail::create_graph(handle_, - new_src_v, - new_dst_v, - new_weight_v, - num_clusters, - experimental::graph_properties_t{true, true}, - current_graph_view_); + std::tie(current_graph_, numbering_map) = + coarsen_graph(handle_, current_graph_view_, dendrogram_->current_level_begin()); current_graph_view_ = current_graph_->view(); - src_indices_v_.resize(new_src_v.size()); - - local_num_vertices_ = current_graph_view_.get_number_of_local_vertices(); - local_num_rows_ = current_graph_view_.get_number_of_local_adj_matrix_partition_rows(); - local_num_cols_ = current_graph_view_.get_number_of_local_adj_matrix_partition_cols(); - local_num_edges_ = new_src_v.size(); + rmm::device_uvector numbering_indices(numbering_map.size(), handle_.get_stream()); + thrust::sequence(rmm::exec_policy(handle_.get_stream_view()), + numbering_indices.begin(), + numbering_indices.end(), + current_graph_view_.get_local_vertex_first()); - cugraph::detail::offsets_to_indices( - current_graph_view_.offsets(), local_num_rows_, src_indices_v_.data().get()); - } + relabel( + handle_, + std::make_tuple(static_cast(numbering_map.begin()), + static_cast(numbering_indices.begin())), + current_graph_view_.get_number_of_local_vertices(), + dendrogram_->current_level_begin(), + dendrogram_->current_level_size(), + false); - std:: - tuple, rmm::device_vector, rmm::device_vector> - combine_local_edges(rmm::device_vector &src_v, - rmm::device_vector &dst_v, - rmm::device_vector &weight_v) - { - thrust::stable_sort_by_key( - rmm::exec_policy(stream_)->on(stream_), - dst_v.begin(), - dst_v.end(), - thrust::make_zip_iterator(thrust::make_tuple(src_v.begin(), weight_v.begin()))); - thrust::stable_sort_by_key( - rmm::exec_policy(stream_)->on(stream_), - src_v.begin(), - src_v.end(), - thrust::make_zip_iterator(thrust::make_tuple(dst_v.begin(), weight_v.begin()))); - - rmm::device_vector combined_src_v(src_v.size()); - rmm::device_vector combined_dst_v(src_v.size()); - rmm::device_vector combined_weight_v(src_v.size()); - - // - // Now we reduce by key to combine the weights of duplicate - // edges. 
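The removed combine_local_edges above sorts the coarsened COO by (src, dst) and then reduces by key so parallel edges collapse into one weighted edge. A host-side sketch of the same pattern, using std::sort and a single accumulation pass in place of thrust::stable_sort_by_key / thrust::reduce_by_key (illustrative only, not part of this changeset):

// Illustrative sketch only (not part of the diff): merge duplicate (src, dst) edges
// produced by coarsening by summing their weights.
#include <algorithm>
#include <cstdint>
#include <tuple>
#include <vector>

struct Edge { int32_t src; int32_t dst; double wt; };

std::vector<Edge> combine_duplicate_edges(std::vector<Edge> edges)
{
  // Sort so duplicate endpoint pairs become adjacent, mirroring the device-side sorts.
  std::sort(edges.begin(), edges.end(), [](Edge const& a, Edge const& b) {
    return std::tie(a.src, a.dst) < std::tie(b.src, b.dst);
  });
  std::vector<Edge> combined;
  for (auto const& e : edges) {
    if (!combined.empty() && combined.back().src == e.src && combined.back().dst == e.dst) {
      combined.back().wt += e.wt;  // reduce_by_key step: accumulate the duplicate's weight
    } else {
      combined.push_back(e);
    }
  }
  return combined;
}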
- // - auto start = thrust::make_zip_iterator(thrust::make_tuple(src_v.begin(), dst_v.begin())); - auto new_start = - thrust::make_zip_iterator(thrust::make_tuple(combined_src_v.begin(), combined_dst_v.begin())); - auto new_end = thrust::reduce_by_key(rmm::exec_policy(stream_)->on(stream_), - start, - start + src_v.size(), - weight_v.begin(), - new_start, - combined_weight_v.begin(), - thrust::equal_to>(), - thrust::plus()); - - auto num_edges = thrust::distance(new_start, new_end.first); - - combined_src_v.resize(num_edges); - combined_dst_v.resize(num_edges); - combined_weight_v.resize(num_edges); - - return std::make_tuple(combined_src_v, combined_dst_v, combined_weight_v); + timer_stop(handle_.get_stream()); } protected: - raft::handle_t const &handle_; - cudaStream_t stream_; - - vertex_t number_of_vertices_; - vertex_t base_vertex_id_{0}; - vertex_t base_src_vertex_id_{0}; - vertex_t base_dst_vertex_id_{0}; - int rank_{0}; + raft::handle_t const& handle_; - vertex_t local_num_vertices_; - vertex_t local_num_rows_; - vertex_t local_num_cols_; - edge_t local_num_edges_; + std::unique_ptr> dendrogram_; // - // Copy of graph + // Initially we run on the input graph view, + // but as we shrink the graph we'll keep the + // current graph here // std::unique_ptr current_graph_{}; graph_view_t current_graph_view_; - // - // For partitioning - // - detail::compute_partition_t compute_partition_; + rmm::device_uvector vertex_weights_v_; + rmm::device_uvector src_vertex_weights_cache_v_; + rmm::device_uvector src_cluster_cache_v_; + rmm::device_uvector dst_cluster_cache_v_; + rmm::device_uvector cluster_keys_v_; + rmm::device_uvector cluster_weights_v_; - rmm::device_vector src_indices_v_; - - // - // Weights and clustering across iterations of algorithm - // - rmm::device_vector vertex_weights_v_; - rmm::device_vector src_vertex_weights_cache_v_{}; - rmm::device_vector dst_vertex_weights_cache_v_{}; - - rmm::device_vector cluster_weights_v_; - rmm::device_vector src_cluster_weights_cache_v_{}; - rmm::device_vector dst_cluster_weights_cache_v_{}; - - rmm::device_vector cluster_v_; - rmm::device_vector src_cluster_cache_v_{}; - rmm::device_vector dst_cluster_cache_v_{}; - - rmm::device_vector empty_cache_weight_v_{}; + weight_t* d_src_vertex_weights_cache_; + vertex_t* d_src_cluster_cache_; + vertex_t* d_dst_cluster_cache_; #ifdef TIMING HighResTimer hr_timer_; #endif -}; // namespace experimental +}; } // namespace experimental } // namespace cugraph diff --git a/cpp/src/experimental/pagerank.cu b/cpp/src/experimental/pagerank.cu index 1aa7f37fa6b..7c3e4b03e9e 100644 --- a/cpp/src/experimental/pagerank.cu +++ b/cpp/src/experimental/pagerank.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,17 +14,15 @@ * limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -42,18 +40,19 @@ namespace detail { // FIXME: personalization_vector_size is confusing in OPG (local or aggregate?) 
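The signature change below moves the optional pagerank inputs from raw nullable pointers to std::optional, so "not provided" is explicit and the three personalization arguments can be validated together. A minimal sketch of that calling convention, with hypothetical simplified names in place of the real parameters (illustrative only, not part of this changeset):

// Illustrative sketch only (not part of the diff): validating optional arguments that
// must be provided together, the invariant the new CUGRAPH_EXPECTS below enforces.
#include <optional>
#include <stdexcept>

void check_personalization(std::optional<int const*> vertices,
                           std::optional<double const*> values,
                           std::optional<int> size)
{
  // If personalization vertices are given, the values and the size must be given too.
  if (vertices.has_value() && !(values.has_value() && size.has_value())) {
    throw std::invalid_argument("personalization arguments must be provided together");
  }
}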
template -void pagerank(raft::handle_t const& handle, - GraphViewType const& pull_graph_view, - typename GraphViewType::weight_type* precomputed_vertex_out_weight_sums, - typename GraphViewType::vertex_type* personalization_vertices, - result_t* personalization_values, - typename GraphViewType::vertex_type personalization_vector_size, - result_t* pageranks, - result_t alpha, - result_t epsilon, - size_t max_iterations, - bool has_initial_guess, - bool do_expensive_check) +void pagerank( + raft::handle_t const& handle, + GraphViewType const& pull_graph_view, + std::optional precomputed_vertex_out_weight_sums, + std::optional personalization_vertices, + std::optional personalization_values, + std::optional personalization_vector_size, + result_t* pageranks, + result_t alpha, + result_t epsilon, + size_t max_iterations, + bool has_initial_guess, + bool do_expensive_check) { using vertex_t = typename GraphViewType::vertex_type; using weight_t = typename GraphViewType::weight_type; @@ -69,27 +68,28 @@ void pagerank(raft::handle_t const& handle, if (num_vertices == 0) { return; } auto aggregate_personalization_vector_size = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce(handle.get_comms(), personalization_vector_size, handle.get_stream()) - : personalization_vector_size; + personalization_vertices + ? GraphViewType::is_multi_gpu + ? host_scalar_allreduce( + handle.get_comms(), *personalization_vector_size, handle.get_stream()) + : *personalization_vector_size + : vertex_t{0}; // 1. check input arguments - CUGRAPH_EXPECTS( - ((personalization_vector_size > 0) && (personalization_vertices != nullptr) && - (personalization_values != nullptr)) || - ((personalization_vector_size == 0) && (personalization_vertices == nullptr) && - (personalization_values == nullptr)), - "Invalid input argument: if personalization_vector_size is non-zero, personalization verties " - "and personalization values should be provided. 
Otherwise, they should not be provided."); + CUGRAPH_EXPECTS((personalization_vertices.has_value() == false) || + (personalization_values.has_value() && personalization_vector_size.has_value()), + "Invalid input argument: if personalization_vertices.has_value() is true, " + "personalization_values.has_value() and personalization_vector_size.has_value() " + "should be true as well."); CUGRAPH_EXPECTS((alpha >= 0.0) && (alpha <= 1.0), "Invalid input argument: alpha should be in [0.0, 1.0]."); CUGRAPH_EXPECTS(epsilon >= 0.0, "Invalid input argument: epsilon should be non-negative."); if (do_expensive_check) { - if (precomputed_vertex_out_weight_sums != nullptr) { + if (precomputed_vertex_out_weight_sums) { auto num_negative_precomputed_vertex_out_weight_sums = count_if_v( - handle, pull_graph_view, precomputed_vertex_out_weight_sums, [] __device__(auto val) { + handle, pull_graph_view, *precomputed_vertex_out_weight_sums, [] __device__(auto val) { return val < result_t{0.0}; }); CUGRAPH_EXPECTS( @@ -118,12 +118,13 @@ void pagerank(raft::handle_t const& handle, } if (aggregate_personalization_vector_size > 0) { - vertex_partition_device_t vertex_partition(pull_graph_view); + auto vertex_partition = vertex_partition_device_view_t( + pull_graph_view.get_vertex_partition_view()); auto num_invalid_vertices = count_if_v(handle, pull_graph_view, - personalization_vertices, - personalization_vertices + personalization_vector_size, + *personalization_vertices, + *personalization_vertices + *personalization_vector_size, [vertex_partition] __device__(auto val) { return !(vertex_partition.is_valid_vertex(val) && vertex_partition.is_local_vertex_nocheck(val)); @@ -132,8 +133,8 @@ void pagerank(raft::handle_t const& handle, "Invalid input argument: personalization vertices have invalid vertex IDs."); auto num_negative_values = count_if_v(handle, pull_graph_view, - personalization_values, - personalization_values + personalization_vector_size, + *personalization_values, + *personalization_values + *personalization_vector_size, [] __device__(auto val) { return val < 0.0; }); CUGRAPH_EXPECTS(num_negative_values == 0, "Invalid input argument: personalization values should be non-negative."); @@ -142,34 +143,21 @@ void pagerank(raft::handle_t const& handle, // 2. compute the sums of the out-going edge weights (if not provided) - rmm::device_uvector tmp_vertex_out_weight_sums(0, handle.get_stream()); - if (precomputed_vertex_out_weight_sums == nullptr) { - tmp_vertex_out_weight_sums.resize(pull_graph_view.get_number_of_local_vertices(), - handle.get_stream()); - // FIXME: better refactor this out (computing out-degree). - copy_v_transform_reduce_out_nbr( - handle, - pull_graph_view, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, - [alpha] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { - return w; - }, - weight_t{0.0}, - tmp_vertex_out_weight_sums.data()); - } - - auto vertex_out_weight_sums = precomputed_vertex_out_weight_sums != nullptr - ? precomputed_vertex_out_weight_sums - : tmp_vertex_out_weight_sums.data(); + auto tmp_vertex_out_weight_sums = precomputed_vertex_out_weight_sums + ? std::nullopt + : std::optional>{ + pull_graph_view.compute_out_weight_sums(handle)}; + auto vertex_out_weight_sums = precomputed_vertex_out_weight_sums + ? *precomputed_vertex_out_weight_sums + : (*tmp_vertex_out_weight_sums).data(); // 3.
initialize pagerank values if (has_initial_guess) { auto sum = reduce_v(handle, pull_graph_view, pageranks, result_t{0.0}); - CUGRAPH_EXPECTS( - sum > 0.0, - "Invalid input argument: sum of the PageRank initial guess values should be positive."); + CUGRAPH_EXPECTS(sum > 0.0, + "Invalid input argument: sum of the PageRank initial " + "guess values should be positive."); thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), pageranks, pageranks + pull_graph_view.get_number_of_local_vertices(), @@ -188,11 +176,12 @@ void pagerank(raft::handle_t const& handle, if (aggregate_personalization_vector_size > 0) { personalization_sum = reduce_v(handle, pull_graph_view, - personalization_values, - personalization_values + personalization_vector_size, + *personalization_values, + *personalization_values + *personalization_vector_size, result_t{0.0}); CUGRAPH_EXPECTS(personalization_sum > 0.0, - "Invalid input argument: sum of personalization valuese should be positive."); + "Invalid input argument: sum of personalization values " + "should be positive."); } // 5. pagerank iteration @@ -254,13 +243,14 @@ void pagerank(raft::handle_t const& handle, pageranks); if (aggregate_personalization_vector_size > 0) { - vertex_partition_device_t vertex_partition(pull_graph_view); + auto vertex_partition = vertex_partition_device_view_t( + pull_graph_view.get_vertex_partition_view()); auto val_first = thrust::make_zip_iterator( - thrust::make_tuple(personalization_vertices, personalization_values)); + thrust::make_tuple(*personalization_vertices, *personalization_values)); thrust::for_each( rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), val_first, - val_first + personalization_vector_size, + val_first + *personalization_vector_size, [vertex_partition, pageranks, dangling_sum, personalization_sum, alpha] __device__( auto val) { auto v = thrust::get<0>(val); @@ -286,8 +276,6 @@ void pagerank(raft::handle_t const& handle, CUGRAPH_FAIL("PageRank failed to converge."); } } - - return; } } // namespace detail @@ -295,10 +283,10 @@ void pagerank(raft::handle_t const& handle, template void pagerank(raft::handle_t const& handle, graph_view_t const& graph_view, - weight_t* precomputed_vertex_out_weight_sums, - vertex_t* personalization_vertices, - result_t* personalization_values, - vertex_t personalization_vector_size, + std::optional precomputed_vertex_out_weight_sums, + std::optional personalization_vertices, + std::optional personalization_values, + std::optional personalization_vector_size, result_t* pageranks, result_t alpha, result_t epsilon, @@ -324,10 +312,10 @@ void pagerank(raft::handle_t const& handle, template void pagerank(raft::handle_t const& handle, graph_view_t const& graph_view, - float* precomputed_vertex_out_weight_sums, - int32_t* personalization_vertices, - float* personalization_values, - int32_t personalization_vector_size, + std::optional precomputed_vertex_out_weight_sums, + std::optional personalization_vertices, + std::optional personalization_values, + std::optional personalization_vector_size, float* pageranks, float alpha, float epsilon, @@ -337,10 +325,10 @@ template void pagerank(raft::handle_t const& handle, template void pagerank(raft::handle_t const& handle, graph_view_t const& graph_view, - double* precomputed_vertex_out_weight_sums, - int32_t* personalization_vertices, - double* personalization_values, - int32_t personalization_vector_size, + std::optional precomputed_vertex_out_weight_sums, + std::optional personalization_vertices, + 
std::optional personalization_values, + std::optional personalization_vector_size, double* pageranks, double alpha, double epsilon, @@ -350,10 +338,10 @@ template void pagerank(raft::handle_t const& handle, template void pagerank(raft::handle_t const& handle, graph_view_t const& graph_view, - float* precomputed_vertex_out_weight_sums, - int32_t* personalization_vertices, - float* personalization_values, - int32_t personalization_vector_size, + std::optional precomputed_vertex_out_weight_sums, + std::optional personalization_vertices, + std::optional personalization_values, + std::optional personalization_vector_size, float* pageranks, float alpha, float epsilon, @@ -363,10 +351,10 @@ template void pagerank(raft::handle_t const& handle, template void pagerank(raft::handle_t const& handle, graph_view_t const& graph_view, - double* precomputed_vertex_out_weight_sums, - int32_t* personalization_vertices, - double* personalization_values, - int32_t personalization_vector_size, + std::optional precomputed_vertex_out_weight_sums, + std::optional personalization_vertices, + std::optional personalization_values, + std::optional personalization_vector_size, double* pageranks, double alpha, double epsilon, @@ -376,10 +364,10 @@ template void pagerank(raft::handle_t const& handle, template void pagerank(raft::handle_t const& handle, graph_view_t const& graph_view, - float* precomputed_vertex_out_weight_sums, - int64_t* personalization_vertices, - float* personalization_values, - int64_t personalization_vector_size, + std::optional precomputed_vertex_out_weight_sums, + std::optional personalization_vertices, + std::optional personalization_values, + std::optional personalization_vector_size, float* pageranks, float alpha, float epsilon, @@ -389,10 +377,10 @@ template void pagerank(raft::handle_t const& handle, template void pagerank(raft::handle_t const& handle, graph_view_t const& graph_view, - double* precomputed_vertex_out_weight_sums, - int64_t* personalization_vertices, - double* personalization_values, - int64_t personalization_vector_size, + std::optional precomputed_vertex_out_weight_sums, + std::optional personalization_vertices, + std::optional personalization_values, + std::optional personalization_vector_size, double* pageranks, double alpha, double epsilon, @@ -402,10 +390,10 @@ template void pagerank(raft::handle_t const& handle, template void pagerank(raft::handle_t const& handle, graph_view_t const& graph_view, - float* precomputed_vertex_out_weight_sums, - int32_t* personalization_vertices, - float* personalization_values, - int32_t personalization_vector_size, + std::optional precomputed_vertex_out_weight_sums, + std::optional personalization_vertices, + std::optional personalization_values, + std::optional personalization_vector_size, float* pageranks, float alpha, float epsilon, @@ -415,10 +403,10 @@ template void pagerank(raft::handle_t const& handle, template void pagerank(raft::handle_t const& handle, graph_view_t const& graph_view, - double* precomputed_vertex_out_weight_sums, - int32_t* personalization_vertices, - double* personalization_values, - int32_t personalization_vector_size, + std::optional precomputed_vertex_out_weight_sums, + std::optional personalization_vertices, + std::optional personalization_values, + std::optional personalization_vector_size, double* pageranks, double alpha, double epsilon, @@ -428,10 +416,10 @@ template void pagerank(raft::handle_t const& handle, template void pagerank(raft::handle_t const& handle, graph_view_t const& graph_view, - float* 
precomputed_vertex_out_weight_sums, - int32_t* personalization_vertices, - float* personalization_values, - int32_t personalization_vector_size, + std::optional precomputed_vertex_out_weight_sums, + std::optional personalization_vertices, + std::optional personalization_values, + std::optional personalization_vector_size, float* pageranks, float alpha, float epsilon, @@ -441,10 +429,10 @@ template void pagerank(raft::handle_t const& handle, template void pagerank(raft::handle_t const& handle, graph_view_t const& graph_view, - double* precomputed_vertex_out_weight_sums, - int32_t* personalization_vertices, - double* personalization_values, - int32_t personalization_vector_size, + std::optional precomputed_vertex_out_weight_sums, + std::optional personalization_vertices, + std::optional personalization_values, + std::optional personalization_vector_size, double* pageranks, double alpha, double epsilon, @@ -454,10 +442,10 @@ template void pagerank(raft::handle_t const& handle, template void pagerank(raft::handle_t const& handle, graph_view_t const& graph_view, - float* precomputed_vertex_out_weight_sums, - int64_t* personalization_vertices, - float* personalization_values, - int64_t personalization_vector_size, + std::optional precomputed_vertex_out_weight_sums, + std::optional personalization_vertices, + std::optional personalization_values, + std::optional personalization_vector_size, float* pageranks, float alpha, float epsilon, @@ -467,10 +455,10 @@ template void pagerank(raft::handle_t const& handle, template void pagerank(raft::handle_t const& handle, graph_view_t const& graph_view, - double* precomputed_vertex_out_weight_sums, - int64_t* personalization_vertices, - double* personalization_values, - int64_t personalization_vector_size, + std::optional precomputed_vertex_out_weight_sums, + std::optional personalization_vertices, + std::optional personalization_values, + std::optional personalization_vector_size, double* pageranks, double alpha, double epsilon, diff --git a/cpp/src/experimental/relabel.cu b/cpp/src/experimental/relabel.cu new file mode 100644 index 00000000000..caefe0be806 --- /dev/null +++ b/cpp/src/experimental/relabel.cu @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
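The relabel() function defined below builds a cuco::static_map from (old label, new label) pairs and rewrites a labels array in place, optionally letting unmapped labels pass through. A single-process sketch of those semantics with std::unordered_map standing in for the device hash map (illustrative only, not part of this changeset):

// Illustrative sketch only (not part of the diff): the relabel() semantics on the host.
#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

void relabel_host(std::vector<std::pair<int32_t, int32_t>> const& old_new_pairs,
                  std::vector<int32_t>& labels, bool skip_missing_labels)
{
  std::unordered_map<int32_t, int32_t> relabel_map(old_new_pairs.begin(), old_new_pairs.end());
  for (auto& label : labels) {
    auto it = relabel_map.find(label);
    if (it != relabel_map.end()) {
      label = it->second;
    } else if (!skip_missing_labels) {
      label = -1;  // stands in for invalid_vertex_id; the strict path expects every label mapped
    }
  }
}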
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cugraph { +namespace experimental { + +// FIXME: think about requiring old_new_label_pairs to be pre-shuffled +template +void relabel(raft::handle_t const& handle, + std::tuple old_new_label_pairs, + vertex_t num_label_pairs, + vertex_t* labels /* [INOUT] */, + vertex_t num_labels, + bool skip_missing_labels, + bool do_expensive_check) +{ + double constexpr load_factor = 0.7; + + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + auto key_func = detail::compute_gpu_id_from_vertex_t{comm_size}; + + // find unique old labels (to be relabeled) + + rmm::device_uvector unique_old_labels(num_labels, handle.get_stream_view()); + thrust::copy(rmm::exec_policy(handle.get_stream_view()), + labels, + labels + num_labels, + unique_old_labels.data()); + thrust::sort(rmm::exec_policy(handle.get_stream_view()), + unique_old_labels.begin(), + unique_old_labels.end()); + unique_old_labels.resize( + thrust::distance(unique_old_labels.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream_view()), + unique_old_labels.begin(), + unique_old_labels.end())), + handle.get_stream_view()); + unique_old_labels.shrink_to_fit(handle.get_stream_view()); + + // collect new labels for the unique old labels + + rmm::device_uvector new_labels_for_unique_old_labels(0, handle.get_stream_view()); + { + // shuffle the old_new_label_pairs based on applying the compute_gpu_id_from_vertex_t functor + // to the old labels + + rmm::device_uvector rx_label_pair_old_labels(0, handle.get_stream_view()); + rmm::device_uvector rx_label_pair_new_labels(0, handle.get_stream_view()); + { + rmm::device_uvector label_pair_old_labels(num_label_pairs, + handle.get_stream_view()); + rmm::device_uvector label_pair_new_labels(num_label_pairs, + handle.get_stream_view()); + thrust::copy(rmm::exec_policy(handle.get_stream_view()), + std::get<0>(old_new_label_pairs), + std::get<0>(old_new_label_pairs) + num_label_pairs, + label_pair_old_labels.begin()); + thrust::copy(rmm::exec_policy(handle.get_stream_view()), + std::get<1>(old_new_label_pairs), + std::get<1>(old_new_label_pairs) + num_label_pairs, + label_pair_new_labels.begin()); + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(label_pair_old_labels.begin(), label_pair_new_labels.begin())); + std::forward_as_tuple(std::tie(rx_label_pair_old_labels, rx_label_pair_new_labels), + std::ignore) = + groupby_gpuid_and_shuffle_values( + handle.get_comms(), + pair_first, + pair_first + num_label_pairs, + [key_func] __device__(auto val) { return key_func(thrust::get<0>(val)); }, + handle.get_stream_view()); + } + + // update intermediate relabel map + + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + auto poly_alloc = + rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()); + auto stream_adapter = + rmm::mr::make_stream_allocator_adaptor(poly_alloc, cudaStream_t{nullptr}); + cuco::static_map + relabel_map{// cuco::static_map requires at least one empty slot + std::max(static_cast( + static_cast(rx_label_pair_old_labels.size()) / load_factor), + rx_label_pair_old_labels.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value, + stream_adapter}; + + auto pair_first = thrust::make_zip_iterator( + 
thrust::make_tuple(rx_label_pair_old_labels.begin(), rx_label_pair_new_labels.begin())); + relabel_map.insert(pair_first, pair_first + rx_label_pair_old_labels.size()); + + rx_label_pair_old_labels.resize(0, handle.get_stream_view()); + rx_label_pair_new_labels.resize(0, handle.get_stream_view()); + rx_label_pair_old_labels.shrink_to_fit(handle.get_stream_view()); + rx_label_pair_new_labels.shrink_to_fit(handle.get_stream_view()); + + // shuffle unique_old_labels, relabel using the intermediate relabel map, and shuffle back + + { + rmm::device_uvector rx_unique_old_labels(0, handle.get_stream_view()); + std::vector rx_value_counts{}; + std::tie(rx_unique_old_labels, rx_value_counts) = groupby_gpuid_and_shuffle_values( + handle.get_comms(), + unique_old_labels.begin(), + unique_old_labels.end(), + [key_func] __device__(auto val) { return key_func(val); }, + handle.get_stream_view()); + + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + if (skip_missing_labels) { + thrust::transform(rmm::exec_policy(handle.get_stream_view()), + rx_unique_old_labels.begin(), + rx_unique_old_labels.end(), + rx_unique_old_labels.begin(), + [view = relabel_map.get_device_view()] __device__(auto old_label) { + auto found = view.find(old_label); + return found != view.end() ? view.find(old_label)->second.load( + cuda::std::memory_order_relaxed) + : old_label; + }); + } else { + relabel_map.find( + rx_unique_old_labels.begin(), + rx_unique_old_labels.end(), + rx_unique_old_labels.begin()); // now rx_unique_old_labels holds the new labels for the + // corresponding old labels + } + + std::tie(new_labels_for_unique_old_labels, std::ignore) = + shuffle_values(handle.get_comms(), + rx_unique_old_labels.begin(), + rx_value_counts, + handle.get_stream_view()); + } + } + + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + { + auto poly_alloc = + rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()); + auto stream_adapter = + rmm::mr::make_stream_allocator_adaptor(poly_alloc, cudaStream_t{nullptr}); + cuco::static_map + relabel_map{ + // cuco::static_map requires at least one empty slot + std::max(static_cast(static_cast(unique_old_labels.size()) / load_factor), + unique_old_labels.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value, + stream_adapter}; + + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(unique_old_labels.begin(), new_labels_for_unique_old_labels.begin())); + relabel_map.insert(pair_first, pair_first + unique_old_labels.size()); + relabel_map.find(labels, labels + num_labels, labels); + } + } else { + cuco::static_map relabel_map( + // cuco::static_map requires at least one empty slot + std::max(static_cast(static_cast(num_label_pairs) / load_factor), + static_cast(num_label_pairs) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(std::get<0>(old_new_label_pairs), std::get<1>(old_new_label_pairs))); + relabel_map.insert(pair_first, pair_first + num_label_pairs); + if (skip_missing_labels) { + thrust::transform(rmm::exec_policy(handle.get_stream_view()), + labels, + labels + num_labels, + labels, + [view = relabel_map.get_device_view()] __device__(auto old_label) { + auto found = view.find(old_label); + return found != view.end() ? 
view.find(old_label)->second.load( + cuda::std::memory_order_relaxed) + : old_label; + }); + } else { + relabel_map.find(labels, labels + num_labels, labels); + } + } + + if (do_expensive_check && !skip_missing_labels) { + CUGRAPH_EXPECTS( + thrust::count(rmm::exec_policy(handle.get_stream_view()), + labels, + labels + num_labels, + invalid_vertex_id::value) == 0, + "Invalid input argument: labels include old label values missing in old_new_label_pairs."); + } + + return; +} + +// explicit instantiation + +template void relabel(raft::handle_t const& handle, + std::tuple old_new_label_pairs, + int32_t num_label_pairs, + int32_t* labels, + int32_t num_labels, + bool skip_missing_labels, + bool do_expensive_check); + +template void relabel( + raft::handle_t const& handle, + std::tuple old_new_label_pairs, + int32_t num_label_pairs, + int32_t* labels, + int32_t num_labels, + bool skip_missing_labels, + bool do_expensive_check); + +template void relabel(raft::handle_t const& handle, + std::tuple old_new_label_pairs, + int64_t num_label_pairs, + int64_t* labels, + int64_t num_labels, + bool skip_missing_labels, + bool do_expensive_check); + +template void relabel( + raft::handle_t const& handle, + std::tuple old_new_label_pairs, + int64_t num_label_pairs, + int64_t* labels, + int64_t num_labels, + bool skip_missing_labels, + bool do_expensive_check); + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/experimental/renumber_edgelist.cu b/cpp/src/experimental/renumber_edgelist.cu new file mode 100644 index 00000000000..6dd48c326ec --- /dev/null +++ b/cpp/src/experimental/renumber_edgelist.cu @@ -0,0 +1,940 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cugraph { +namespace experimental { +namespace detail { + +template +std::tuple, std::vector> compute_renumber_map( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + std::vector const& edgelist_major_vertices, + std::vector const& edgelist_minor_vertices, + std::vector const& edgelist_edge_counts) +{ + // FIXME: compare this sort based approach with hash based approach in both speed and memory + // footprint + + // 1. 
acquire (unique major label, count) pairs + + if (multi_gpu) { + auto& comm = handle.get_comms(); + + // barrier is necessary here to avoid potential overlap (which can lead to deadlock) between + // two different communicators (beginning of col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + } + + rmm::device_uvector major_labels(0, handle.get_stream()); + rmm::device_uvector major_counts(0, handle.get_stream()); + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + rmm::device_uvector tmp_major_labels(0, handle.get_stream()); + rmm::device_uvector tmp_major_counts(0, handle.get_stream()); + { + rmm::device_uvector sorted_major_labels(edgelist_edge_counts[i], + handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream_view()), + edgelist_major_vertices[i], + edgelist_major_vertices[i] + edgelist_edge_counts[i], + sorted_major_labels.begin()); + // FIXME: better refactor this sort-count_if-reduce_by_key routine for reuse + thrust::sort(rmm::exec_policy(handle.get_stream_view()), + sorted_major_labels.begin(), + sorted_major_labels.end()); + auto num_unique_labels = + thrust::count_if(rmm::exec_policy(handle.get_stream_view()), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(sorted_major_labels.size()), + [labels = sorted_major_labels.data()] __device__(auto i) { + return (i == 0) || (labels[i - 1] != labels[i]); + }); + tmp_major_labels.resize(num_unique_labels, handle.get_stream()); + tmp_major_counts.resize(tmp_major_labels.size(), handle.get_stream()); + thrust::reduce_by_key(rmm::exec_policy(handle.get_stream_view()), + sorted_major_labels.begin(), + sorted_major_labels.end(), + thrust::make_constant_iterator(edge_t{1}), + tmp_major_labels.begin(), + tmp_major_counts.begin()); + } + + if (multi_gpu) { + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + rmm::device_uvector rx_major_labels(0, handle.get_stream()); + rmm::device_uvector rx_major_counts(0, handle.get_stream()); + auto rx_sizes = host_scalar_gather( + col_comm, tmp_major_labels.size(), static_cast(i), handle.get_stream()); + std::vector rx_displs{}; + if (static_cast(i) == col_comm_rank) { + rx_displs.assign(col_comm_size, size_t{0}); + std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); + rx_major_labels.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream()); + rx_major_counts.resize(rx_major_labels.size(), handle.get_stream()); + } + device_gatherv(col_comm, + thrust::make_zip_iterator( + thrust::make_tuple(tmp_major_labels.begin(), tmp_major_counts.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(rx_major_labels.begin(), rx_major_counts.begin())), + tmp_major_labels.size(), + rx_sizes, + rx_displs, + static_cast(i), + handle.get_stream()); + if (static_cast(i) == col_comm_rank) { + major_labels = std::move(rx_major_labels); + major_counts = std::move(rx_major_counts); + } + } else { + assert(i == 0); + major_labels = std::move(tmp_major_labels); + major_counts = std::move(tmp_major_counts); + } + } + if (multi_gpu) { + // FIXME: better refactor this sort-count_if-reduce_by_key routine for reuse
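The sort / count_if / reduce_by_key routine the FIXME above refers to turns the major endpoints of the local edges into (unique label, occurrence count) pairs. A host-side sketch of the same histogram pattern (illustrative only, not part of this changeset):

// Illustrative sketch only (not part of the diff): per-label occurrence counts via
// sort-then-accumulate, mirroring thrust::sort + thrust::reduce_by_key.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

std::vector<std::pair<int32_t, int64_t>> label_counts(std::vector<int32_t> labels)
{
  std::sort(labels.begin(), labels.end());  // duplicates become adjacent
  std::vector<std::pair<int32_t, int64_t>> counts;
  for (auto label : labels) {
    if (!counts.empty() && counts.back().first == label) {
      ++counts.back().second;  // reduce_by_key step: accumulate this label's count
    } else {
      counts.emplace_back(label, int64_t{1});
    }
  }
  return counts;
}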
thrust::sort_by_key(rmm::exec_policy(handle.get_stream_view()), + major_labels.begin(), + major_labels.end(), + major_counts.begin()); + auto num_unique_labels = thrust::count_if(rmm::exec_policy(handle.get_stream_view()), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(major_labels.size()), + [labels = major_labels.data()] __device__(auto i) { + return (i == 0) || (labels[i - 1] != labels[i]); + }); + rmm::device_uvector tmp_major_labels(num_unique_labels, handle.get_stream()); + rmm::device_uvector tmp_major_counts(tmp_major_labels.size(), handle.get_stream()); + thrust::reduce_by_key(rmm::exec_policy(handle.get_stream_view()), + major_labels.begin(), + major_labels.end(), + major_counts.begin(), + tmp_major_labels.begin(), + tmp_major_counts.begin()); + major_labels = std::move(tmp_major_labels); + major_counts = std::move(tmp_major_counts); + } + + // 2. acquire unique minor labels + + std::vector minor_displs(edgelist_minor_vertices.size(), edge_t{0}); + std::partial_sum( + edgelist_edge_counts.begin(), edgelist_edge_counts.end() - 1, minor_displs.begin() + 1); + rmm::device_uvector minor_labels(minor_displs.back() + edgelist_edge_counts.back(), + handle.get_stream()); + for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) { + thrust::copy(rmm::exec_policy(handle.get_stream_view()), + edgelist_minor_vertices[i], + edgelist_minor_vertices[i] + edgelist_edge_counts[i], + minor_labels.begin() + minor_displs[i]); + } + thrust::sort( + rmm::exec_policy(handle.get_stream_view()), minor_labels.begin(), minor_labels.end()); + minor_labels.resize(thrust::distance(minor_labels.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream_view()), + minor_labels.begin(), + minor_labels.end())), + handle.get_stream()); + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + + // barrier is necessary here to avoid potential overlap (which can lead to deadlock) between + // two different communicators (beginning of row_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + + if (row_comm_size > 1) { + rmm::device_uvector rx_minor_labels(0, handle.get_stream()); + std::tie(rx_minor_labels, std::ignore) = groupby_gpuid_and_shuffle_values( + row_comm, + minor_labels.begin(), + minor_labels.end(), + [key_func = detail::compute_gpu_id_from_vertex_t{row_comm_size}] __device__( + auto val) { return key_func(val); }, + handle.get_stream()); + thrust::sort( + rmm::exec_policy(handle.get_stream_view()), rx_minor_labels.begin(), rx_minor_labels.end()); + rx_minor_labels.resize( + thrust::distance(rx_minor_labels.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream_view()), + rx_minor_labels.begin(), + rx_minor_labels.end())), + handle.get_stream()); + minor_labels = std::move(rx_minor_labels); + } + + // barrier is necessary here to avoid potential overlap (which can lead to deadlock) between + // two different communicators (end of row_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK + // and MPI barrier with MPI) + // + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + 
comm.barrier(); // currently, this is ncclAllReduce +#endif + } + minor_labels.shrink_to_fit(handle.get_stream_view()); + + // 3. merge major and minor labels and vertex labels + + rmm::device_uvector merged_labels(major_labels.size() + minor_labels.size(), + handle.get_stream_view()); + rmm::device_uvector merged_counts(merged_labels.size(), handle.get_stream_view()); + thrust::merge_by_key(rmm::exec_policy(handle.get_stream_view()), + major_labels.begin(), + major_labels.end(), + minor_labels.begin(), + minor_labels.end(), + major_counts.begin(), + thrust::make_constant_iterator(edge_t{0}), + merged_labels.begin(), + merged_counts.begin()); + + major_labels.resize(0, handle.get_stream()); + major_counts.resize(0, handle.get_stream()); + minor_labels.resize(0, handle.get_stream()); + major_labels.shrink_to_fit(handle.get_stream()); + major_counts.shrink_to_fit(handle.get_stream()); + minor_labels.shrink_to_fit(handle.get_stream()); + + rmm::device_uvector labels(merged_labels.size(), handle.get_stream()); + rmm::device_uvector counts(labels.size(), handle.get_stream()); + auto pair_it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream_view()), + merged_labels.begin(), + merged_labels.end(), + merged_counts.begin(), + labels.begin(), + counts.begin()); + merged_labels.resize(0, handle.get_stream()); + merged_counts.resize(0, handle.get_stream()); + merged_labels.shrink_to_fit(handle.get_stream()); + merged_counts.shrink_to_fit(handle.get_stream()); + labels.resize(thrust::distance(labels.begin(), thrust::get<0>(pair_it)), handle.get_stream()); + counts.resize(labels.size(), handle.get_stream()); + labels.shrink_to_fit(handle.get_stream()); + counts.shrink_to_fit(handle.get_stream()); + + // 4. if optional_vertex_span is valid, add isolated vertices + + rmm::device_uvector isolated_vertices(0, handle.get_stream()); + if (optional_vertex_span) { + auto [vertices, num_vertices] = *optional_vertex_span; + auto num_isolated_vertices = thrust::count_if( + rmm::exec_policy(handle.get_stream_view()), + vertices, + vertices + num_vertices, + [label_first = labels.begin(), label_last = labels.end()] __device__(auto v) { + return !thrust::binary_search(thrust::seq, label_first, label_last, v); + }); + isolated_vertices.resize(num_isolated_vertices, handle.get_stream()); + thrust::copy_if(rmm::exec_policy(handle.get_stream_view()), + vertices, + vertices + num_vertices, + isolated_vertices.begin(), + [label_first = labels.begin(), label_last = labels.end()] __device__(auto v) { + return !thrust::binary_search(thrust::seq, label_first, label_last, v); + }); + } + + if (isolated_vertices.size() > 0) { + labels.resize(labels.size() + isolated_vertices.size(), handle.get_stream()); + counts.resize(labels.size(), handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream_view()), + isolated_vertices.begin(), + isolated_vertices.end(), + labels.end() - isolated_vertices.size()); + thrust::fill(rmm::exec_policy(handle.get_stream_view()), + counts.end() - isolated_vertices.size(), + counts.end(), + edge_t{0}); + } + + // 5. sort by degree + + thrust::sort_by_key(rmm::exec_policy(handle.get_stream_view()), + counts.begin(), + counts.end(), + labels.begin(), + thrust::greater()); + + // 6.
compute segment_offsets + + static_assert(detail::num_sparse_segments_per_vertex_partition == 3); + static_assert((detail::low_degree_threshold <= detail::mid_degree_threshold) && + (detail::mid_degree_threshold <= std::numeric_limits::max())); + size_t mid_degree_threshold{detail::mid_degree_threshold}; + size_t low_degree_threshold{detail::low_degree_threshold}; + size_t hypersparse_degree_threshold{0}; + if (multi_gpu) { + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + mid_degree_threshold *= col_comm_size; + low_degree_threshold *= col_comm_size; + hypersparse_degree_threshold = + static_cast(col_comm_size * detail::hypersparse_threshold_ratio); + } + auto num_segments_per_vertex_partition = + detail::num_sparse_segments_per_vertex_partition + + (hypersparse_degree_threshold > 0 ? size_t{1} : size_t{0}); + rmm::device_uvector d_thresholds(num_segments_per_vertex_partition - 1, + handle.get_stream()); + auto h_thresholds = hypersparse_degree_threshold > 0 + ? std::vector{static_cast(mid_degree_threshold), + static_cast(low_degree_threshold), + static_cast(hypersparse_degree_threshold)} + : std::vector{static_cast(mid_degree_threshold), + static_cast(low_degree_threshold)}; + raft::update_device( + d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), handle.get_stream()); + + rmm::device_uvector d_segment_offsets(num_segments_per_vertex_partition + 1, + handle.get_stream()); + + auto zero_vertex = vertex_t{0}; + auto vertex_count = static_cast(counts.size()); + d_segment_offsets.set_element_async(0, zero_vertex, handle.get_stream()); + d_segment_offsets.set_element_async( + num_segments_per_vertex_partition, vertex_count, handle.get_stream()); + + thrust::upper_bound(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + counts.begin(), + counts.end(), + d_thresholds.begin(), + d_thresholds.end(), + d_segment_offsets.begin() + 1, + thrust::greater{}); + + std::vector h_segment_offsets(d_segment_offsets.size()); + raft::update_host(h_segment_offsets.data(), + d_segment_offsets.data(), + d_segment_offsets.size(), + handle.get_stream()); + handle.get_stream_view().synchronize(); + + return std::make_tuple(std::move(labels), h_segment_offsets); +} + +template +void expensive_check_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + std::vector const& edgelist_major_vertices, + std::vector const& edgelist_minor_vertices, + std::vector const& edgelist_edge_counts) +{ + rmm::device_uvector sorted_local_vertices(size_t{0}, handle.get_stream()); + if (optional_vertex_span) { + auto [vertices, num_vertices] = *optional_vertex_span; + sorted_local_vertices.resize(num_vertices, handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream_view()), + vertices, + vertices + num_vertices, + sorted_local_vertices.begin()); + thrust::sort(rmm::exec_policy(handle.get_stream_view()), + sorted_local_vertices.begin(), + sorted_local_vertices.end()); + CUGRAPH_EXPECTS(static_cast(thrust::distance( + sorted_local_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream_view()), + sorted_local_vertices.begin(), + sorted_local_vertices.end()))) == sorted_local_vertices.size(), + "Invalid input argument: local_vertices should not have duplicates."); + } + + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = 
handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto const row_comm_rank = row_comm.get_rank(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + auto const col_comm_rank = col_comm.get_rank(); + + CUGRAPH_EXPECTS((edgelist_major_vertices.size() == edgelist_minor_vertices.size()) && + (edgelist_major_vertices.size() == static_cast(col_comm_size)), + "Invalid input argument: both edgelist_major_vertices.size() & " + "edgelist_minor_vertices.size() should coincide with col_comm_size."); + + auto [local_vertices, num_local_vertices] = *optional_vertex_span; + CUGRAPH_EXPECTS( + thrust::count_if( + rmm::exec_policy(handle.get_stream_view()), + local_vertices, + local_vertices + num_local_vertices, + [comm_rank, + key_func = + detail::compute_gpu_id_from_vertex_t{comm_size}] __device__(auto val) { + return key_func(val) != comm_rank; + }) == 0, + "Invalid input argument: local_vertices should be pre-shuffled."); + + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices[i], edgelist_minor_vertices[i])); + CUGRAPH_EXPECTS( + thrust::count_if( + rmm::exec_policy(handle.get_stream_view()), + edge_first, + edge_first + edgelist_edge_counts[i], + [comm_size, + comm_rank, + row_comm_rank, + col_comm_size, + col_comm_rank, + i, + gpu_id_key_func = + detail::compute_gpu_id_from_edge_t{comm_size, row_comm_size, col_comm_size}, + partition_id_key_func = + detail::compute_partition_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto edge) { + return (gpu_id_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != comm_rank) || + (partition_id_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != + row_comm_rank * col_comm_size + col_comm_rank + i * comm_size); + }) == 0, + "Invalid input argument: edgelist_major_vertices & edgelist_minor_vertices should be " + "pre-shuffled."); + + if (optional_vertex_span) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + + // FIXME: this barrier is unnecessary if the above host_scalar_allreduce is a true host + // operation (as it serves as a barrier). A barrier is necessary here to avoid potential + // overlap (which can lead to deadlock) between two different communicators (beginning of + // col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with + // DASK and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + + rmm::device_uvector sorted_major_vertices(0, handle.get_stream()); + { + auto recvcounts = + host_scalar_allgather(col_comm, sorted_local_vertices.size(), handle.get_stream()); + std::vector displacements(recvcounts.size(), size_t{0}); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + sorted_major_vertices.resize(displacements.back() + recvcounts.back(), + handle.get_stream()); + device_allgatherv(col_comm, + sorted_local_vertices.data(), + sorted_major_vertices.data(), + recvcounts, + displacements, + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream_view()), + sorted_major_vertices.begin(), + 
sorted_major_vertices.end()); + } + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) + // between two different communicators (beginning of row_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with + // DASK and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + + rmm::device_uvector sorted_minor_vertices(0, handle.get_stream()); + { + auto recvcounts = + host_scalar_allgather(row_comm, sorted_local_vertices.size(), handle.get_stream()); + std::vector displacements(recvcounts.size(), size_t{0}); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + sorted_minor_vertices.resize(displacements.back() + recvcounts.back(), + handle.get_stream()); + device_allgatherv(row_comm, + sorted_local_vertices.data(), + sorted_minor_vertices.data(), + recvcounts, + displacements, + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream_view()), + sorted_minor_vertices.begin(), + sorted_minor_vertices.end()); + } + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) + // between two different communicators (end of row_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with + // DASK and MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices[i], edgelist_minor_vertices[i])); + CUGRAPH_EXPECTS( + thrust::count_if( + rmm::exec_policy(handle.get_stream_view()), + edge_first, + edge_first + edgelist_edge_counts[i], + [num_major_vertices = static_cast(sorted_major_vertices.size()), + sorted_major_vertices = sorted_major_vertices.data(), + num_minor_vertices = static_cast(sorted_minor_vertices.size()), + sorted_minor_vertices = sorted_minor_vertices.data()] __device__(auto e) { + return !thrust::binary_search(thrust::seq, + sorted_major_vertices, + sorted_major_vertices + num_major_vertices, + thrust::get<0>(e)) || + !thrust::binary_search(thrust::seq, + sorted_minor_vertices, + sorted_minor_vertices + num_minor_vertices, + thrust::get<1>(e)); + }) == 0, + "Invalid input argument: edgelist_major_vertices and/or edgelist_mior_vertices have " + "invalid vertex ID(s)."); + } + } + } else { + assert(edgelist_major_vertices.size() == 1); + assert(edgelist_minor_vertices.size() == 1); + + if (optional_vertex_span) { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices[0], edgelist_minor_vertices[0])); + CUGRAPH_EXPECTS( + thrust::count_if( + rmm::exec_policy(handle.get_stream_view()), + edge_first, + edge_first + edgelist_edge_counts[0], + [sorted_local_vertices = sorted_local_vertices.data(), + num_sorted_local_vertices = + static_cast(sorted_local_vertices.size())] __device__(auto e) { + return !thrust::binary_search(thrust::seq, + sorted_local_vertices, + sorted_local_vertices + num_sorted_local_vertices, + thrust::get<0>(e)) || + !thrust::binary_search(thrust::seq, + sorted_local_vertices, + sorted_local_vertices + num_sorted_local_vertices, + thrust::get<1>(e)); + }) == 0, + "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have " + "invalid vertex ID(s)."); + } + } +} + 
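expensive_check_edgelist() above reduces to one pattern worth internalizing: gather the authoritative vertex set, sort it once, then reject any edge whose endpoint fails a binary search. Below is a minimal single-GPU sketch of that pattern, assuming nvcc with --extended-lambda; the names are illustrative and not part of the cuGraph API.

#include <thrust/binary_search.h>
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>

#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
  std::vector<int32_t> h_vertices{2, 0, 3, 1};  // the known vertex set
  thrust::device_vector<int32_t> vertices(h_vertices.begin(), h_vertices.end());
  thrust::device_vector<int32_t> srcs(std::vector<int32_t>{0, 1, 3}.data(), std::vector<int32_t>{0, 1, 3}.data() + 3);
  thrust::device_vector<int32_t> dsts(std::vector<int32_t>{2, 5, 0}.data(), std::vector<int32_t>{2, 5, 0}.data() + 3);  // 5 is not a known vertex

  // sort once so every later membership test is a cheap binary search
  thrust::sort(thrust::device, vertices.begin(), vertices.end());

  auto v_first    = thrust::raw_pointer_cast(vertices.data());
  auto v_count    = vertices.size();
  auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(srcs.begin(), dsts.begin()));

  // count edges with an endpoint absent from the sorted vertex set
  auto num_invalid = thrust::count_if(
    thrust::device, edge_first, edge_first + srcs.size(),
    [v_first, v_count] __device__(auto e) {
      return !thrust::binary_search(thrust::seq, v_first, v_first + v_count, thrust::get<0>(e)) ||
             !thrust::binary_search(thrust::seq, v_first, v_first + v_count, thrust::get<1>(e));
    });

  assert(num_invalid == 1);  // only the edge (1, 5) is rejected
  return 0;
}

The multi-GPU path above runs the same check per matrix partition, with allgathered major/minor vertex sets standing in for the single local set.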
+template +std::enable_if_t, + partition_t, + vertex_t, + edge_t, + std::vector>> +renumber_edgelist(raft::handle_t const& handle, + std::optional> optional_local_vertex_span, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto const row_comm_rank = row_comm.get_rank(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + auto const col_comm_rank = col_comm.get_rank(); + + std::vector edgelist_const_major_vertices(edgelist_major_vertices.size()); + std::vector edgelist_const_minor_vertices(edgelist_const_major_vertices.size()); + for (size_t i = 0; i < edgelist_const_major_vertices.size(); ++i) { + edgelist_const_major_vertices[i] = edgelist_major_vertices[i]; + edgelist_const_minor_vertices[i] = edgelist_minor_vertices[i]; + } + + if (do_expensive_check) { + expensive_check_edgelist(handle, + optional_local_vertex_span, + edgelist_const_major_vertices, + edgelist_const_minor_vertices, + edgelist_edge_counts); + } + + // 1. compute renumber map + + auto [renumber_map_labels, segment_offsets] = + detail::compute_renumber_map(handle, + optional_local_vertex_span, + edgelist_const_major_vertices, + edgelist_const_minor_vertices, + edgelist_edge_counts); + // 2. initialize partition_t object, number_of_vertices, and number_of_edges for the coarsened + // graph + + auto vertex_counts = host_scalar_allgather( + comm, static_cast(renumber_map_labels.size()), handle.get_stream()); + std::vector vertex_partition_offsets(comm_size + 1, 0); + std::partial_sum( + vertex_counts.begin(), vertex_counts.end(), vertex_partition_offsets.begin() + 1); + + partition_t partition( + vertex_partition_offsets, row_comm_size, col_comm_size, row_comm_rank, col_comm_rank); + + auto number_of_vertices = vertex_partition_offsets.back(); + auto number_of_edges = host_scalar_allreduce( + comm, + std::accumulate(edgelist_edge_counts.begin(), edgelist_edge_counts.end(), edge_t{0}), + handle.get_stream()); + + // 3. renumber edges + + double constexpr load_factor = 0.7; + + // FIXME: compare this hash based approach with a binary search based approach in both memory + // footprint and execution time + + // FIXME: this barrier is unnecessary if the above host_scalar_allgather is a true host operation + // (as it serves as a barrier) barrier is necessary here to avoid potential overlap (which can + // leads to deadlock) between two different communicators (beginning of col_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK and + // MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + rmm::device_uvector renumber_map_major_labels( + col_comm_rank == static_cast(i) ? 
vertex_t{0} + : partition.get_matrix_partition_major_size(i), + handle.get_stream()); + device_bcast(col_comm, + renumber_map_labels.data(), + renumber_map_major_labels.data(), + partition.get_matrix_partition_major_size(i), + i, + handle.get_stream()); + + CUDA_TRY(cudaStreamSynchronize( + handle.get_stream())); // cuco::static_map currently does not take stream + + auto poly_alloc = rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()); + auto stream_adapter = rmm::mr::make_stream_allocator_adaptor(poly_alloc, cudaStream_t{nullptr}); + cuco::static_map + renumber_map{ + // cuco::static_map requires at least one empty slot + std::max(static_cast( + static_cast(partition.get_matrix_partition_major_size(i)) / load_factor), + static_cast(partition.get_matrix_partition_major_size(i)) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value, + stream_adapter}; + auto pair_first = thrust::make_zip_iterator(thrust::make_tuple( + col_comm_rank == static_cast(i) ? renumber_map_labels.begin() + : renumber_map_major_labels.begin(), + thrust::make_counting_iterator(partition.get_matrix_partition_major_first(i)))); + renumber_map.insert(pair_first, pair_first + partition.get_matrix_partition_major_size(i)); + renumber_map.find(edgelist_major_vertices[i], + edgelist_major_vertices[i] + edgelist_edge_counts[i], + edgelist_major_vertices[i]); + } + + // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between two + // different communicators (beginning of row_comm) +#if 1 + // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK and + // MPI barrier with MPI) + host_barrier(comm, handle.get_stream_view()); +#else + handle.get_stream_view().synchronize(); + comm.barrier(); // currently, this is ncclAllReduce +#endif + { + rmm::device_uvector renumber_map_minor_labels( + partition.get_matrix_partition_minor_size(), handle.get_stream()); + std::vector recvcounts(row_comm_size); + for (int i = 0; i < row_comm_size; ++i) { + recvcounts[i] = partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i); + } + std::vector displacements(recvcounts.size(), 0); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + device_allgatherv(row_comm, + renumber_map_labels.begin(), + renumber_map_minor_labels.begin(), + recvcounts, + displacements, + handle.get_stream()); + + CUDA_TRY(cudaStreamSynchronize( + handle.get_stream())); // cuco::static_map currently does not take stream + + auto poly_alloc = rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()); + auto stream_adapter = rmm::mr::make_stream_allocator_adaptor(poly_alloc, cudaStream_t{nullptr}); + cuco::static_map + renumber_map{// cuco::static_map requires at least one empty slot + std::max(static_cast( + static_cast(renumber_map_minor_labels.size()) / load_factor), + renumber_map_minor_labels.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value, + stream_adapter}; + auto pair_first = thrust::make_zip_iterator(thrust::make_tuple( + renumber_map_minor_labels.begin(), + thrust::make_counting_iterator(partition.get_matrix_partition_minor_first()))); + renumber_map.insert(pair_first, pair_first + renumber_map_minor_labels.size()); + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + renumber_map.find(edgelist_minor_vertices[i], + edgelist_minor_vertices[i] + edgelist_edge_counts[i], + edgelist_minor_vertices[i]); + } + } + // barrier is necessary here to avoid potential overlap (which 
can lead to deadlock) between two
+  // different communicators (end of row_comm)
+#if 1
+  // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK and
+  // MPI barrier with MPI)
+  host_barrier(comm, handle.get_stream_view());
+#else
+  handle.get_stream_view().synchronize();
+  comm.barrier();  // currently, this is ncclAllReduce
+#endif
+
+  return std::make_tuple(std::move(renumber_map_labels),
+                         partition,
+                         number_of_vertices,
+                         number_of_edges,
+                         segment_offsets);
+}
+
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+std::enable_if_t<!multi_gpu, std::tuple<rmm::device_uvector<vertex_t>, std::vector<vertex_t>>>
+renumber_edgelist(raft::handle_t const& handle,
+                  std::optional<std::tuple<vertex_t const*, vertex_t>> optional_vertex_span,
+                  vertex_t* edgelist_major_vertices /* [INOUT] */,
+                  vertex_t* edgelist_minor_vertices /* [INOUT] */,
+                  edge_t num_edgelist_edges,
+                  bool do_expensive_check)
+{
+  if (do_expensive_check) {
+    expensive_check_edgelist<vertex_t, edge_t, multi_gpu>(
+      handle,
+      optional_vertex_span,
+      std::vector<vertex_t const*>{edgelist_major_vertices},
+      std::vector<vertex_t const*>{edgelist_minor_vertices},
+      std::vector<edge_t>{num_edgelist_edges});
+  }
+
+  auto [renumber_map_labels, segment_offsets] =
+    detail::compute_renumber_map<vertex_t, edge_t, multi_gpu>(
+      handle,
+      optional_vertex_span,
+      std::vector<vertex_t const*>{edgelist_major_vertices},
+      std::vector<vertex_t const*>{edgelist_minor_vertices},
+      std::vector<edge_t>{num_edgelist_edges});
+
+  double constexpr load_factor = 0.7;
+
+  // FIXME: compare this hash-based approach with a binary search based approach in both memory
+  // footprint and execution time
+
+  auto poly_alloc = rmm::mr::polymorphic_allocator<char>(rmm::mr::get_current_device_resource());
+  auto stream_adapter = rmm::mr::make_stream_allocator_adaptor(poly_alloc, cudaStream_t{nullptr});
+  cuco::static_map<vertex_t, vertex_t, cuda::thread_scope_device, decltype(stream_adapter)>
+    renumber_map{
+      // cuco::static_map requires at least one empty slot
+      std::max(static_cast<size_t>(static_cast<double>(renumber_map_labels.size()) / load_factor),
+               renumber_map_labels.size() + 1),
+      invalid_vertex_id<vertex_t>::value,
+      invalid_vertex_id<vertex_t>::value,
+      stream_adapter};
+  auto pair_first = thrust::make_zip_iterator(
+    thrust::make_tuple(renumber_map_labels.begin(), thrust::make_counting_iterator(vertex_t{0})));
+  renumber_map.insert(pair_first, pair_first + renumber_map_labels.size());
+  renumber_map.find(
+    edgelist_major_vertices, edgelist_major_vertices + num_edgelist_edges, edgelist_major_vertices);
+  renumber_map.find(
+    edgelist_minor_vertices, edgelist_minor_vertices + num_edgelist_edges, edgelist_minor_vertices);
+
+  return std::make_tuple(std::move(renumber_map_labels), segment_offsets);
+}
+
+}  // namespace detail
+
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+std::enable_if_t<multi_gpu,
+                 std::tuple<rmm::device_uvector<vertex_t>,
+                            partition_t<vertex_t>,
+                            vertex_t,
+                            edge_t,
+                            std::vector<vertex_t>>>
+renumber_edgelist(raft::handle_t const& handle,
+                  std::optional<std::tuple<vertex_t const*, vertex_t>> optional_local_vertex_span,
+                  std::vector<vertex_t*> const& edgelist_major_vertices /* [INOUT] */,
+                  std::vector<vertex_t*> const& edgelist_minor_vertices /* [INOUT] */,
+                  std::vector<edge_t> const& edgelist_edge_counts,
+                  bool do_expensive_check)
+{
+  return detail::renumber_edgelist<vertex_t, edge_t, multi_gpu>(handle,
+                                                                optional_local_vertex_span,
+                                                                edgelist_major_vertices,
+                                                                edgelist_minor_vertices,
+                                                                edgelist_edge_counts,
+                                                                do_expensive_check);
+}
+
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+std::enable_if_t<!multi_gpu, std::tuple<rmm::device_uvector<vertex_t>, std::vector<vertex_t>>>
+renumber_edgelist(raft::handle_t const& handle,
+                  std::optional<std::tuple<vertex_t const*, vertex_t>> optional_vertex_span,
+                  vertex_t* edgelist_major_vertices /* [INOUT] */,
+                  vertex_t* edgelist_minor_vertices /* [INOUT] */,
+                  edge_t num_edgelist_edges,
+                  bool do_expensive_check)
+{
+  return detail::renumber_edgelist<vertex_t, edge_t, multi_gpu>(handle,
+                                                                optional_vertex_span,
+                                                                edgelist_major_vertices,
+                                                                edgelist_minor_vertices,
+                                                                num_edgelist_edges,
+                                                                do_expensive_check);
+}
+
+// explicit instantiation directives (EIDir's):
+//
+
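The EIDir's that follow force the compiler to emit object code for each supported (vertex_t, edge_t, multi_gpu) combination in this translation unit, so the function templates above can stay out of the public headers. A generic, hypothetical sketch of the mechanism (the names below are made up, not cuGraph's):

// mylib.hpp -- declaration only; callers never see the definition
template <typename T>
T twice(T x);

// mylib.cu -- definition plus explicit instantiation directives
template <typename T>
T twice(T x)
{
  return x + x;
}

// emit object code for exactly these types into this translation unit
template int32_t twice<int32_t>(int32_t);
template int64_t twice<int64_t>(int64_t);

Any combination not instantiated this way would produce a link error for callers, which is why every supported type pairing is spelled out below.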
+// instantiations for <vertex_t = int32_t, edge_t = int32_t>
+//
+template std::
+  tuple<rmm::device_uvector<int32_t>, partition_t<int32_t>, int32_t, int32_t, std::vector<int32_t>>
+  renumber_edgelist<int32_t, int32_t, true>(
+    raft::handle_t const& handle,
+    std::optional<std::tuple<int32_t const*, int32_t>> optional_local_vertex_span,
+    std::vector<int32_t*> const& edgelist_major_vertices /* [INOUT] */,
+    std::vector<int32_t*> const& edgelist_minor_vertices /* [INOUT] */,
+    std::vector<int32_t> const& edgelist_edge_counts,
+    bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>, std::vector<int32_t>>
+renumber_edgelist<int32_t, int32_t, false>(
+  raft::handle_t const& handle,
+  std::optional<std::tuple<int32_t const*, int32_t>> optional_vertex_span,
+  int32_t* edgelist_major_vertices /* [INOUT] */,
+  int32_t* edgelist_minor_vertices /* [INOUT] */,
+  int32_t num_edgelist_edges,
+  bool do_expensive_check);
+
+// instantiations for <vertex_t = int32_t, edge_t = int64_t>
+//
+template std::
+  tuple<rmm::device_uvector<int32_t>, partition_t<int32_t>, int32_t, int64_t, std::vector<int32_t>>
+  renumber_edgelist<int32_t, int64_t, true>(
+    raft::handle_t const& handle,
+    std::optional<std::tuple<int32_t const*, int32_t>> optional_local_vertex_span,
+    std::vector<int32_t*> const& edgelist_major_vertices /* [INOUT] */,
+    std::vector<int32_t*> const& edgelist_minor_vertices /* [INOUT] */,
+    std::vector<int64_t> const& edgelist_edge_counts,
+    bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>, std::vector<int32_t>>
+renumber_edgelist<int32_t, int64_t, false>(
+  raft::handle_t const& handle,
+  std::optional<std::tuple<int32_t const*, int32_t>> optional_vertex_span,
+  int32_t* edgelist_major_vertices /* [INOUT] */,
+  int32_t* edgelist_minor_vertices /* [INOUT] */,
+  int64_t num_edgelist_edges,
+  bool do_expensive_check);
+
+// instantiations for <vertex_t = int64_t, edge_t = int64_t>
+//
+template std::
+  tuple<rmm::device_uvector<int64_t>, partition_t<int64_t>, int64_t, int64_t, std::vector<int64_t>>
+  renumber_edgelist<int64_t, int64_t, true>(
+    raft::handle_t const& handle,
+    std::optional<std::tuple<int64_t const*, int64_t>> optional_local_vertex_span,
+    std::vector<int64_t*> const& edgelist_major_vertices /* [INOUT] */,
+    std::vector<int64_t*> const& edgelist_minor_vertices /* [INOUT] */,
+    std::vector<int64_t> const& edgelist_edge_counts,
+    bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>, std::vector<int64_t>>
+renumber_edgelist<int64_t, int64_t, false>(
+  raft::handle_t const& handle,
+  std::optional<std::tuple<int64_t const*, int64_t>> optional_vertex_span,
+  int64_t* edgelist_major_vertices /* [INOUT] */,
+  int64_t* edgelist_minor_vertices /* [INOUT] */,
+  int64_t num_edgelist_edges,
+  bool do_expensive_check);
+
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/src/experimental/renumber_utils.cu b/cpp/src/experimental/renumber_utils.cu
new file mode 100644
index 00000000000..dc2d44a139a
--- /dev/null
+++ b/cpp/src/experimental/renumber_utils.cu
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cugraph { +namespace experimental { + +template +void renumber_ext_vertices(raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + bool do_expensive_check) +{ + double constexpr load_factor = 0.7; + + if (do_expensive_check) { + rmm::device_uvector labels(local_int_vertex_last - local_int_vertex_first, + handle.get_stream_view()); + thrust::copy(rmm::exec_policy(handle.get_stream_view()), + renumber_map_labels, + renumber_map_labels + labels.size(), + labels.begin()); + thrust::sort(rmm::exec_policy(handle.get_stream_view()), labels.begin(), labels.end()); + CUGRAPH_EXPECTS( + thrust::unique(rmm::exec_policy(handle.get_stream_view()), labels.begin(), labels.end()) == + labels.end(), + "Invalid input arguments: renumber_map_labels have duplicate elements."); + } + + auto poly_alloc = rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()); + auto stream_adapter = rmm::mr::make_stream_allocator_adaptor(poly_alloc, cudaStream_t{nullptr}); + auto renumber_map_ptr = std::make_unique< + cuco::static_map>( + size_t{0}, + invalid_vertex_id::value, + invalid_vertex_id::value, + stream_adapter); + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + rmm::device_uvector sorted_unique_ext_vertices(num_vertices, + handle.get_stream_view()); + sorted_unique_ext_vertices.resize( + thrust::distance( + sorted_unique_ext_vertices.begin(), + thrust::copy_if(rmm::exec_policy(handle.get_stream_view()), + vertices, + vertices + num_vertices, + sorted_unique_ext_vertices.begin(), + [] __device__(auto v) { return v != invalid_vertex_id::value; })), + handle.get_stream_view()); + thrust::sort(rmm::exec_policy(handle.get_stream_view()), + sorted_unique_ext_vertices.begin(), + sorted_unique_ext_vertices.end()); + sorted_unique_ext_vertices.resize( + thrust::distance(sorted_unique_ext_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream_view()), + sorted_unique_ext_vertices.begin(), + sorted_unique_ext_vertices.end())), + handle.get_stream_view()); + + auto int_vertices_for_sorted_unique_ext_vertices = collect_values_for_unique_keys( + comm, + renumber_map_labels, + renumber_map_labels + (local_int_vertex_last - local_int_vertex_first), + thrust::make_counting_iterator(local_int_vertex_first), + sorted_unique_ext_vertices.begin(), + sorted_unique_ext_vertices.end(), + detail::compute_gpu_id_from_vertex_t{comm_size}, + handle.get_stream_view()); + + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + renumber_map_ptr.reset(); + + renumber_map_ptr = std::make_unique< + cuco::static_map>( + // cuco::static_map requires at least one empty slot + std::max( + static_cast(static_cast(sorted_unique_ext_vertices.size()) / load_factor), + sorted_unique_ext_vertices.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value, + stream_adapter); + + auto kv_pair_first = thrust::make_zip_iterator(thrust::make_tuple( + sorted_unique_ext_vertices.begin(), int_vertices_for_sorted_unique_ext_vertices.begin())); + renumber_map_ptr->insert(kv_pair_first, kv_pair_first + sorted_unique_ext_vertices.size()); + } else { + handle.get_stream_view().synchronize(); // cuco::static_map 
currently does not take stream + + renumber_map_ptr.reset(); + + renumber_map_ptr = std::make_unique< + cuco::static_map>( + // cuco::static_map requires at least one empty slot + std::max(static_cast( + static_cast(local_int_vertex_last - local_int_vertex_first) / load_factor), + static_cast(local_int_vertex_last - local_int_vertex_first) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value, + stream_adapter); + + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(renumber_map_labels, thrust::make_counting_iterator(vertex_t{0}))); + renumber_map_ptr->insert(pair_first, + pair_first + (local_int_vertex_last - local_int_vertex_first)); + } + + if (do_expensive_check) { + rmm::device_uvector contains(num_vertices, handle.get_stream_view()); + renumber_map_ptr->contains(vertices, vertices + num_vertices, contains.begin()); + auto vc_pair_first = thrust::make_zip_iterator(thrust::make_tuple(vertices, contains.begin())); + CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(handle.get_stream_view()), + vc_pair_first, + vc_pair_first + num_vertices, + [] __device__(auto pair) { + auto v = thrust::get<0>(pair); + auto c = thrust::get<1>(pair); + return v == invalid_vertex_id::value + ? (c == true) + : (c == false); + }) == 0, + "Invalid input arguments: vertices have elements that are missing in " + "(aggregate) renumber_map_labels."); + } + + renumber_map_ptr->find(vertices, vertices + num_vertices, vertices); +} + +template +void unrenumber_local_int_vertices( + raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels /* size = local_int_vertex_last - local_int_vertex_first */, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + bool do_expensive_check) +{ + if (do_expensive_check) { + CUGRAPH_EXPECTS( + thrust::count_if(rmm::exec_policy(handle.get_stream_view()), + vertices, + vertices + num_vertices, + [local_int_vertex_first, local_int_vertex_last] __device__(auto v) { + return v != invalid_vertex_id::value && + (v < local_int_vertex_first || v >= local_int_vertex_last); + }) == 0, + "Invalid input arguments: there are non-local vertices in [vertices, vertices " + "+ num_vertices)."); + } + + thrust::transform(rmm::exec_policy(handle.get_stream_view()), + vertices, + vertices + num_vertices, + vertices, + [renumber_map_labels, local_int_vertex_first] __device__(auto v) { + return v == invalid_vertex_id::value + ? 
v + : renumber_map_labels[v - local_int_vertex_first]; + }); +} + +template +void unrenumber_int_vertices(raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + std::vector const& vertex_partition_lasts, + bool do_expensive_check) +{ + double constexpr load_factor = 0.7; + + if (do_expensive_check) { + CUGRAPH_EXPECTS( + thrust::count_if(rmm::exec_policy(handle.get_stream_view()), + vertices, + vertices + num_vertices, + [int_vertex_last = vertex_partition_lasts.back()] __device__(auto v) { + return v != invalid_vertex_id::value && + !is_valid_vertex(int_vertex_last, v); + }) == 0, + "Invalid input arguments: there are out-of-range vertices in [vertices, vertices " + "+ num_vertices)."); + } + + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + rmm::device_uvector sorted_unique_int_vertices(num_vertices, + handle.get_stream_view()); + sorted_unique_int_vertices.resize( + thrust::distance( + sorted_unique_int_vertices.begin(), + thrust::copy_if(rmm::exec_policy(handle.get_stream_view()), + vertices, + vertices + num_vertices, + sorted_unique_int_vertices.begin(), + [] __device__(auto v) { return v != invalid_vertex_id::value; })), + handle.get_stream_view()); + thrust::sort(rmm::exec_policy(handle.get_stream_view()), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end()); + sorted_unique_int_vertices.resize( + thrust::distance(sorted_unique_int_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream_view()), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end())), + handle.get_stream_view()); + + rmm::device_uvector d_vertex_partition_lasts(vertex_partition_lasts.size(), + handle.get_stream_view()); + raft::update_device(d_vertex_partition_lasts.data(), + vertex_partition_lasts.data(), + vertex_partition_lasts.size(), + handle.get_stream()); + rmm::device_uvector d_tx_int_vertex_offsets(d_vertex_partition_lasts.size(), + handle.get_stream_view()); + thrust::lower_bound(rmm::exec_policy(handle.get_stream_view()), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end(), + d_vertex_partition_lasts.begin(), + d_vertex_partition_lasts.end(), + d_tx_int_vertex_offsets.begin()); + std::vector h_tx_int_vertex_counts(d_tx_int_vertex_offsets.size()); + raft::update_host(h_tx_int_vertex_counts.data(), + d_tx_int_vertex_offsets.data(), + d_tx_int_vertex_offsets.size(), + handle.get_stream()); + handle.get_stream_view().synchronize(); + std::adjacent_difference( + h_tx_int_vertex_counts.begin(), h_tx_int_vertex_counts.end(), h_tx_int_vertex_counts.begin()); + + rmm::device_uvector rx_int_vertices(0, handle.get_stream_view()); + std::vector rx_int_vertex_counts{}; + std::tie(rx_int_vertices, rx_int_vertex_counts) = shuffle_values( + comm, sorted_unique_int_vertices.begin(), h_tx_int_vertex_counts, handle.get_stream_view()); + + auto tx_ext_vertices = std::move(rx_int_vertices); + thrust::transform(rmm::exec_policy(handle.get_stream_view()), + tx_ext_vertices.begin(), + tx_ext_vertices.end(), + tx_ext_vertices.begin(), + [renumber_map_labels, local_int_vertex_first] __device__(auto v) { + return renumber_map_labels[v - local_int_vertex_first]; + }); + + rmm::device_uvector rx_ext_vertices_for_sorted_unique_int_vertices( + 0, handle.get_stream_view()); + std::tie(rx_ext_vertices_for_sorted_unique_int_vertices, std::ignore) = + shuffle_values(comm, 
tx_ext_vertices.begin(), rx_int_vertex_counts, handle.get_stream_view()); + + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + auto poly_alloc = rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()); + auto stream_adapter = rmm::mr::make_stream_allocator_adaptor(poly_alloc, cudaStream_t{nullptr}); + cuco::static_map + unrenumber_map{ + // cuco::static_map requires at least one empty slot + std::max( + static_cast(static_cast(sorted_unique_int_vertices.size()) / load_factor), + sorted_unique_int_vertices.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value, + stream_adapter}; + + auto pair_first = thrust::make_zip_iterator(thrust::make_tuple( + sorted_unique_int_vertices.begin(), rx_ext_vertices_for_sorted_unique_int_vertices.begin())); + unrenumber_map.insert(pair_first, pair_first + sorted_unique_int_vertices.size()); + unrenumber_map.find(vertices, vertices + num_vertices, vertices); + } else { + unrenumber_local_int_vertices(handle, + vertices, + num_vertices, + renumber_map_labels, + local_int_vertex_first, + local_int_vertex_last, + do_expensive_check); + } +} + +// explicit instantiation + +template void renumber_ext_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + bool do_expensive_check); + +template void renumber_ext_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + bool do_expensive_check); + +template void renumber_ext_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + bool do_expensive_check); + +template void renumber_ext_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + bool do_expensive_check); + +template void unrenumber_local_int_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + bool do_expensive_check); + +template void unrenumber_local_int_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + bool do_expensive_check); + +template void unrenumber_int_vertices( + raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + std::vector const& vertex_partition_lasts, + bool do_expensive_check); + +template void unrenumber_int_vertices( + raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + std::vector const& vertex_partition_lasts, + bool do_expensive_check); + +template void unrenumber_int_vertices( + raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + std::vector const& vertex_partition_lasts, + bool 
do_expensive_check); + +template void unrenumber_int_vertices( + raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + std::vector const& vertex_partition_lasts, + bool do_expensive_check); + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/experimental/scramble.cuh b/cpp/src/experimental/scramble.cuh new file mode 100644 index 00000000000..875bb5feff0 --- /dev/null +++ b/cpp/src/experimental/scramble.cuh @@ -0,0 +1,82 @@ +/* Copyright (C) 2009-2010 The Trustees of Indiana University. */ +/* */ +/* Use, modification and distribution is subject to the Boost Software */ +/* License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at */ +/* http://www.boost.org/LICENSE_1_0.txt) */ +/* */ +/* Authors: Jeremiah Willcock */ +/* Andrew Lumsdaine */ + +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cugraph { +namespace experimental { +namespace detail { + +template +__device__ std::enable_if_t bitreversal(uvertex_t value) +{ + return __brevll(value); +} + +template +__device__ std::enable_if_t bitreversal(uvertex_t value) +{ + return __brev(value); +} + +template +__device__ std::enable_if_t bitreversal(uvertex_t value) +{ + return static_cast(__brev(value) >> 16); +} + +/* Apply a permutation to scramble vertex numbers; a randomly generated + * permutation is not used because applying it at scale is too expensive. */ +template +__device__ vertex_t scramble(vertex_t value, size_t lgN) +{ + constexpr size_t number_of_bits = sizeof(vertex_t) * 8; + + static_assert((number_of_bits == 64) || (number_of_bits == 32) || (number_of_bits == 16)); + assert((std::is_unsigned::value && lgN <= number_of_bits) || + (!std::is_unsigned::value && lgN < number_of_bits)); + assert(value >= 0); + + using uvertex_t = typename std::make_unsigned::type; + + constexpr auto scramble_value0 = static_cast( + sizeof(vertex_t) == 8 ? 606610977102444280 : (sizeof(vertex_t) == 4 ? 282475248 : 0)); + constexpr auto scramble_value1 = static_cast( + sizeof(vertex_t) == 8 ? 11680327234415193037 : (sizeof(vertex_t) == 4 ? 2617694917 : 8620)); + + auto v = static_cast(value); + v += scramble_value0 + scramble_value1; + v *= (scramble_value0 | static_cast(0x4519840211493211)); + v = bitreversal(v) >> (number_of_bits - lgN); + v *= (scramble_value1 | static_cast(0x3050852102C843A5)); + v = bitreversal(v) >> (number_of_bits - lgN); + return static_cast(v); +} + +} // namespace detail +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/experimental/shuffle.cuh b/cpp/src/experimental/shuffle.cuh deleted file mode 100644 index 40f3b510b10..00000000000 --- a/cpp/src/experimental/shuffle.cuh +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -namespace cugraph { -namespace experimental { - -namespace detail { - -// -// FIXME: This implementation of variable_shuffle stages the data for transfer -// in host memory. It would be more efficient, I believe, to stage the -// data in device memory, but it would require actually instantiating -// the data in device memory which is already precious in the Louvain -// implementation. We should explore if it's actually more efficient -// through device memory and whether the improvement is worth the extra -// memory required. -// -template -rmm::device_vector variable_shuffle(raft::handle_t const &handle, - std::size_t n_elements, - iterator_t data_iter, - partition_iter_t partition_iter) -{ - // - // We need to compute the size of data movement - // - raft::comms::comms_t const &comms = handle.get_comms(); - - cudaStream_t stream = handle.get_stream(); - int num_gpus = comms.get_size(); - int my_gpu = comms.get_rank(); - - rmm::device_vector local_sizes_v(num_gpus, size_t{0}); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - partition_iter, - partition_iter + n_elements, - [num_gpus, d_local_sizes = local_sizes_v.data().get()] __device__(auto p) { - atomicAdd(d_local_sizes + p, size_t{1}); - }); - - std::vector h_local_sizes_v(num_gpus); - std::vector h_global_sizes_v(num_gpus); - std::vector h_input_v(n_elements); - std::vector h_partitions_v(n_elements); - - thrust::copy(local_sizes_v.begin(), local_sizes_v.end(), h_local_sizes_v.begin()); - thrust::copy(partition_iter, partition_iter + n_elements, h_partitions_v.begin()); - - std::vector requests(2 * num_gpus); - - int request_pos = 0; - - for (int gpu = 0; gpu < num_gpus; ++gpu) { - if (gpu != my_gpu) { - comms.irecv(&h_global_sizes_v[gpu], 1, gpu, 0, &requests[request_pos]); - ++request_pos; - comms.isend(&h_local_sizes_v[gpu], 1, gpu, 0, &requests[request_pos]); - ++request_pos; - } else { - h_global_sizes_v[gpu] = h_local_sizes_v[gpu]; - } - } - - if (request_pos > 0) { comms.waitall(request_pos, requests.data()); } - - comms.barrier(); - - // - // Now global_sizes contains all of the counts, we need to - // allocate an array of the appropriate size - // - int64_t receive_size = - thrust::reduce(thrust::host, h_global_sizes_v.begin(), h_global_sizes_v.end()); - - std::vector temp_data; - - if (receive_size > 0) temp_data.resize(receive_size); - - rmm::device_vector input_v(n_elements); - - auto input_start = input_v.begin(); - - for (int gpu = 0; gpu < num_gpus; ++gpu) { - input_start = thrust::copy_if(rmm::exec_policy(stream)->on(stream), - data_iter, - data_iter + n_elements, - partition_iter, - input_start, - [gpu] __device__(int32_t p) { return p == gpu; }); - } - - thrust::copy(input_v.begin(), input_v.end(), h_input_v.begin()); - - std::vector temp_v(num_gpus + 1); - - thrust::exclusive_scan( - thrust::host, h_global_sizes_v.begin(), h_global_sizes_v.end(), temp_v.begin()); - - temp_v[num_gpus] = temp_v[num_gpus - 
1] + h_global_sizes_v[num_gpus - 1]; - h_global_sizes_v = temp_v; - - thrust::exclusive_scan( - thrust::host, h_local_sizes_v.begin(), h_local_sizes_v.end(), temp_v.begin()); - - temp_v[num_gpus] = temp_v[num_gpus - 1] + h_local_sizes_v[num_gpus - 1]; - h_local_sizes_v = temp_v; - - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - comms.barrier(); - - request_pos = 0; - - for (int gpu = 0; gpu < num_gpus; ++gpu) { - size_t to_receive = h_global_sizes_v[gpu + 1] - h_global_sizes_v[gpu]; - size_t to_send = h_local_sizes_v[gpu + 1] - h_local_sizes_v[gpu]; - - if (gpu != my_gpu) { - if (to_receive > 0) { - comms.irecv( - temp_data.data() + h_global_sizes_v[gpu], to_receive, gpu, 0, &requests[request_pos]); - ++request_pos; - } - - if (to_send > 0) { - comms.isend( - h_input_v.data() + h_local_sizes_v[gpu], to_send, gpu, 0, &requests[request_pos]); - ++request_pos; - } - } else if (to_receive > 0) { - std::copy(h_input_v.begin() + h_local_sizes_v[gpu], - h_input_v.begin() + h_local_sizes_v[gpu + 1], - temp_data.begin() + h_global_sizes_v[gpu]); - } - } - - comms.barrier(); - - if (request_pos > 0) { comms.waitall(request_pos, requests.data()); } - - comms.barrier(); - - return rmm::device_vector(temp_data); -} - -} // namespace detail - -/** - * @brief shuffle data to the desired partition - * - * MNMG algorithms require shuffling data between partitions - * to get the data to the right location for computation. - * - * This function operates dynamically, there is no - * a priori knowledge about where the data will need - * to be transferred. - * - * This function will be executed on each GPU. Each gpu - * has a portion of the data (specified by begin_data and - * end_data iterators) and an iterator that identifies - * (for each corresponding element) which GPU the data - * should be shuffled to. - * - * The return value will be a device vector containing - * the data received by this GPU. - * - * Note that this function accepts iterators as input. - * `partition_iterator` will be traversed multiple times. - * - * @tparam is_multi_gpu If true, multi-gpu - shuffle will occur - * If false, single GPU - simple copy will occur - * @tparam data_t Type of the data being shuffled - * @tparam iterator_t Iterator referencing data to be shuffled - * @tparam partition_iter_t Iterator identifying the destination partition - * - * @param handle Library handle (RAFT) - * @param n_elements Number of elements to transfer - * @param data_iter Iterator that returns the elements to be transfered - * @param partition_iter Iterator that returns the partition where elements - * should be transfered. - */ -template * = nullptr> -rmm::device_vector variable_shuffle(raft::handle_t const &handle, - std::size_t n_elements, - iterator_t data_iter, - partition_iter_t partition_iter) -{ - return detail::variable_shuffle(handle, n_elements, data_iter, partition_iter); -} - -template * = nullptr> -rmm::device_vector variable_shuffle(raft::handle_t const &handle, - std::size_t n_elements, - iterator_t data_iter, - partition_iter_t partition_iter) -{ - return rmm::device_vector(data_iter, data_iter + n_elements); -} - -} // namespace experimental -} // namespace cugraph diff --git a/cpp/src/experimental/sssp.cu b/cpp/src/experimental/sssp.cu index ebcde1b1444..b6f85ddb3c2 100644 --- a/cpp/src/experimental/sssp.cu +++ b/cpp/src/experimental/sssp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,16 +14,16 @@ * limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -42,9 +43,9 @@ namespace experimental { namespace detail { template -void sssp(raft::handle_t const &handle, - GraphViewType const &push_graph_view, - typename GraphViewType::weight_type *distances, +void sssp(raft::handle_t const& handle, + GraphViewType const& push_graph_view, + typename GraphViewType::weight_type* distances, PredecessorIterator predecessor_first, typename GraphViewType::vertex_type source_vertex, typename GraphViewType::weight_type cutoff, @@ -70,6 +71,9 @@ void sssp(raft::handle_t const &handle, CUGRAPH_EXPECTS(push_graph_view.is_valid_vertex(source_vertex), "Invalid input argument: source vertex out-of-range."); + CUGRAPH_EXPECTS(push_graph_view.is_weighted(), + "Invalid input argument: an unweighted graph is passed to SSSP, BFS is more " + "efficient for unweighted graphs."); if (do_expensive_check) { auto num_negative_edge_weights = @@ -122,15 +126,12 @@ void sssp(raft::handle_t const &handle, // 4. initialize SSSP frontier - enum class Bucket { cur_near, new_near, far, num_buckets }; - // FIXME: need to double check the bucket sizes are sufficient - std::vector bucket_sizes(static_cast(Bucket::num_buckets), - push_graph_view.get_number_of_local_vertices()); - VertexFrontier, - vertex_t, + enum class Bucket { cur_near, next_near, far, num_buckets }; + VertexFrontier(Bucket::num_buckets)> - vertex_frontier(handle, bucket_sizes); + vertex_frontier(handle); // 5. SSSP iteration @@ -167,13 +168,15 @@ void sssp(raft::handle_t const &handle, row_distances); } - vertex_partition_device_t vertex_partition(push_graph_view); + auto vertex_partition = vertex_partition_device_view_t( + push_graph_view.get_vertex_partition_view()); update_frontier_v_push_if_out_nbr( handle, push_graph_view, - vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).begin(), - vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).end(), + vertex_frontier, + static_cast(Bucket::cur_near), + std::vector{static_cast(Bucket::next_near), static_cast(Bucket::far)}, row_distances, thrust::make_constant_iterator(0) /* dummy */, [vertex_partition, distances, cutoff] __device__( @@ -188,58 +191,62 @@ void sssp(raft::handle_t const &handle, threshold = old_distance < threshold ? old_distance : threshold; } if (new_distance >= threshold) { push = false; } - return thrust::make_tuple(push, new_distance, src); + return push ? thrust::optional>{thrust::make_tuple( + new_distance, src)} + : thrust::nullopt; }, reduce_op::min>(), distances, thrust::make_zip_iterator(thrust::make_tuple(distances, predecessor_first)), - vertex_frontier, - [near_far_threshold] __device__(auto v_val, auto pushed_val) { + [near_far_threshold] __device__(auto v, auto v_val, auto pushed_val) { auto new_dist = thrust::get<0>(pushed_val); auto idx = new_dist < v_val - ? (new_dist < near_far_threshold ? static_cast(Bucket::new_near) - : static_cast(Bucket::far)) - : VertexFrontier, vertex_t>::kInvalidBucketIdx; - return thrust::make_tuple(idx, thrust::get<0>(pushed_val), thrust::get<1>(pushed_val)); + ? (new_dist < near_far_threshold ? 
static_cast<size_t>(Bucket::next_near)
+                                            : static_cast<size_t>(Bucket::far))
+                 : VertexFrontier<vertex_t>::kInvalidBucketIdx;
+        return new_dist < v_val
+                 ? thrust::optional<thrust::tuple<size_t, decltype(pushed_val)>>{thrust::make_tuple(
+                     static_cast<size_t>(new_dist < near_far_threshold ? Bucket::next_near
+                                                                       : Bucket::far),
+                     pushed_val)}
+                 : thrust::nullopt;
       });
 
     vertex_frontier.get_bucket(static_cast<size_t>(Bucket::cur_near)).clear();
-    if (vertex_frontier.get_bucket(static_cast<size_t>(Bucket::new_near)).aggregate_size() > 0) {
+    vertex_frontier.get_bucket(static_cast<size_t>(Bucket::cur_near)).shrink_to_fit();
+    if (vertex_frontier.get_bucket(static_cast<size_t>(Bucket::next_near)).aggregate_size() > 0) {
       vertex_frontier.swap_buckets(static_cast<size_t>(Bucket::cur_near),
-                                   static_cast<size_t>(Bucket::new_near));
+                                   static_cast<size_t>(Bucket::next_near));
     } else if (vertex_frontier.get_bucket(static_cast<size_t>(Bucket::far)).aggregate_size() > 0) {
       // near queue is empty, split the far queue
       auto old_near_far_threshold = near_far_threshold;
       near_far_threshold += delta;
 
-      size_t new_near_size{0};
-      size_t new_far_size{0};
+      size_t near_size{0};
+      size_t far_size{0};
       while (true) {
         vertex_frontier.split_bucket(
           static_cast<size_t>(Bucket::far),
+          std::vector<size_t>{static_cast<size_t>(Bucket::cur_near)},
           [vertex_partition, distances, old_near_far_threshold, near_far_threshold] __device__(
             auto v) {
             auto dist =
               *(distances + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v));
-            if (dist < old_near_far_threshold) {
-              return VertexFrontier<thrust::tuple<weight_t, vertex_t>, vertex_t>::kInvalidBucketIdx;
-            } else if (dist < near_far_threshold) {
-              return static_cast<size_t>(Bucket::cur_near);
-            } else {
-              return static_cast<size_t>(Bucket::far);
-            }
+            return dist >= old_near_far_threshold
+                     ? thrust::optional<size_t>{static_cast<size_t>(
+                         dist < near_far_threshold ? Bucket::cur_near : Bucket::far)}
+                     : thrust::nullopt;
           });
-        new_near_size =
+        near_size =
           vertex_frontier.get_bucket(static_cast<size_t>(Bucket::cur_near)).aggregate_size();
-        new_far_size =
-          vertex_frontier.get_bucket(static_cast<size_t>(Bucket::far)).aggregate_size();
-        if ((new_near_size > 0) || (new_far_size == 0)) {
+        far_size = vertex_frontier.get_bucket(static_cast<size_t>(Bucket::far)).aggregate_size();
+        if ((near_size > 0) || (far_size == 0)) {
           break;
         } else {
           near_far_threshold += delta;
         }
       }
-      if ((new_near_size == 0) && (new_far_size == 0)) { break; }
+      if ((near_size == 0) && (far_size == 0)) { break; }
     } else {
       break;
     }
@@ -249,17 +256,15 @@ void sssp(raft::handle_t const &handle,
       handle.get_stream()));  // this is necessary as vertex_frontier will become out-of-scope once
                               // this function returns (FIXME: should I stream sync in VertexFrontier
                               // destructor?)
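The cur_near/next_near/far buckets in this hunk implement a delta-stepping-style SSSP: only vertices whose tentative distance falls inside the current near band are expanded, and when the near bucket drains, the far bucket is re-split against a threshold advanced by delta. A toy host-side version of the split rule applied by split_bucket above (illustrative names, not the cuGraph API):

#include <cstddef>
#include <optional>

enum class Bucket : size_t { cur_near, far };

// Mirror of the split_bucket functor: vertices already settled below the old
// threshold drop out (nullopt); the rest land in the advanced near band or stay far.
std::optional<Bucket> split(float dist, float old_threshold, float new_threshold)
{
  if (dist < old_threshold) { return std::nullopt; }
  return (dist < new_threshold) ? std::optional<Bucket>{Bucket::cur_near}
                                : std::optional<Bucket>{Bucket::far};
}

The surrounding while loop keeps advancing new_threshold by delta until the near bucket receives at least one vertex or the far bucket is exhausted, which is what guarantees progress.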
- - return; } } // namespace detail template -void sssp(raft::handle_t const &handle, - graph_view_t const &graph_view, - weight_t *distances, - vertex_t *predecessors, +void sssp(raft::handle_t const& handle, + graph_view_t const& graph_view, + weight_t* distances, + vertex_t* predecessors, vertex_t source_vertex, weight_t cutoff, bool do_expensive_check) @@ -280,98 +285,98 @@ void sssp(raft::handle_t const &handle, // explicit instantiation -template void sssp(raft::handle_t const &handle, - graph_view_t const &graph_view, - float *distances, - int32_t *predecessors, +template void sssp(raft::handle_t const& handle, + graph_view_t const& graph_view, + float* distances, + int32_t* predecessors, int32_t source_vertex, float cutoff, bool do_expensive_check); -template void sssp(raft::handle_t const &handle, - graph_view_t const &graph_view, - double *distances, - int32_t *predecessors, +template void sssp(raft::handle_t const& handle, + graph_view_t const& graph_view, + double* distances, + int32_t* predecessors, int32_t source_vertex, double cutoff, bool do_expensive_check); -template void sssp(raft::handle_t const &handle, - graph_view_t const &graph_view, - float *distances, - int32_t *predecessors, +template void sssp(raft::handle_t const& handle, + graph_view_t const& graph_view, + float* distances, + int32_t* predecessors, int32_t source_vertex, float cutoff, bool do_expensive_check); -template void sssp(raft::handle_t const &handle, - graph_view_t const &graph_view, - double *distances, - int32_t *predecessors, +template void sssp(raft::handle_t const& handle, + graph_view_t const& graph_view, + double* distances, + int32_t* predecessors, int32_t source_vertex, double cutoff, bool do_expensive_check); -template void sssp(raft::handle_t const &handle, - graph_view_t const &graph_view, - float *distances, - int64_t *predecessors, +template void sssp(raft::handle_t const& handle, + graph_view_t const& graph_view, + float* distances, + int64_t* predecessors, int64_t source_vertex, float cutoff, bool do_expensive_check); -template void sssp(raft::handle_t const &handle, - graph_view_t const &graph_view, - double *distances, - int64_t *predecessors, +template void sssp(raft::handle_t const& handle, + graph_view_t const& graph_view, + double* distances, + int64_t* predecessors, int64_t source_vertex, double cutoff, bool do_expensive_check); -template void sssp(raft::handle_t const &handle, - graph_view_t const &graph_view, - float *distances, - int32_t *predecessors, +template void sssp(raft::handle_t const& handle, + graph_view_t const& graph_view, + float* distances, + int32_t* predecessors, int32_t source_vertex, float cutoff, bool do_expensive_check); -template void sssp(raft::handle_t const &handle, - graph_view_t const &graph_view, - double *distances, - int32_t *predecessors, +template void sssp(raft::handle_t const& handle, + graph_view_t const& graph_view, + double* distances, + int32_t* predecessors, int32_t source_vertex, double cutoff, bool do_expensive_check); -template void sssp(raft::handle_t const &handle, - graph_view_t const &graph_view, - float *distances, - int32_t *predecessors, +template void sssp(raft::handle_t const& handle, + graph_view_t const& graph_view, + float* distances, + int32_t* predecessors, int32_t source_vertex, float cutoff, bool do_expensive_check); -template void sssp(raft::handle_t const &handle, - graph_view_t const &graph_view, - double *distances, - int32_t *predecessors, +template void sssp(raft::handle_t const& handle, + graph_view_t const& 
graph_view, + double* distances, + int32_t* predecessors, int32_t source_vertex, double cutoff, bool do_expensive_check); -template void sssp(raft::handle_t const &handle, - graph_view_t const &graph_view, - float *distances, - int64_t *predecessors, +template void sssp(raft::handle_t const& handle, + graph_view_t const& graph_view, + float* distances, + int64_t* predecessors, int64_t source_vertex, float cutoff, bool do_expensive_check); -template void sssp(raft::handle_t const &handle, - graph_view_t const &graph_view, - double *distances, - int64_t *predecessors, +template void sssp(raft::handle_t const& handle, + graph_view_t const& graph_view, + double* distances, + int64_t* predecessors, int64_t source_vertex, double cutoff, bool do_expensive_check); diff --git a/cpp/src/generators/erdos_renyi_generator.cu b/cpp/src/generators/erdos_renyi_generator.cu new file mode 100644 index 00000000000..8452a613174 --- /dev/null +++ b/cpp/src/generators/erdos_renyi_generator.cu @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace cugraph { + +template +std::tuple, rmm::device_uvector> +generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle, + vertex_t num_vertices, + float p, + vertex_t base_vertex_id, + uint64_t seed) +{ + CUGRAPH_EXPECTS(num_vertices < std::numeric_limits::max(), + "Implementation cannot support specified value"); + + auto random_iterator = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [seed] __device__(size_t index) { + thrust::default_random_engine rng(seed); + thrust::uniform_real_distribution dist(0.0, 1.0); + rng.discard(index); + return dist(rng); + }); + + size_t count = thrust::count_if(rmm::exec_policy(handle.get_stream()), + random_iterator, + random_iterator + num_vertices * num_vertices, + [p] __device__(float prob) { return prob < p; }); + + rmm::device_uvector indices_v(count, handle.get_stream()); + + thrust::copy_if(rmm::exec_policy(handle.get_stream()), + random_iterator, + random_iterator + num_vertices * num_vertices, + indices_v.begin(), + [p] __device__(float prob) { return prob < p; }); + + rmm::device_uvector src_v(count, handle.get_stream()); + rmm::device_uvector dst_v(count, handle.get_stream()); + + thrust::transform(rmm::exec_policy(handle.get_stream()), + indices_v.begin(), + indices_v.end(), + thrust::make_zip_iterator(thrust::make_tuple(src_v.begin(), src_v.end())), + [num_vertices] __device__(size_t index) { + size_t src = index / num_vertices; + size_t dst = index % num_vertices; + + return thrust::make_tuple(static_cast(src), + static_cast(dst)); + }); + + handle.get_stream_view().synchronize(); + + return std::make_tuple(std::move(src_v), std::move(dst_v)); +} + +template +std::tuple, rmm::device_uvector> +generate_erdos_renyi_graph_edgelist_gnm(raft::handle_t const& handle, + vertex_t num_vertices, + size_t m, + vertex_t 
base_vertex_id, + uint64_t seed) +{ + CUGRAPH_FAIL("Not implemented"); +} + +template std::tuple, rmm::device_uvector> +generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle, + int32_t num_vertices, + float p, + int32_t base_vertex_id, + uint64_t seed); + +template std::tuple, rmm::device_uvector> +generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle, + int64_t num_vertices, + float p, + int64_t base_vertex_id, + uint64_t seed); + +template std::tuple, rmm::device_uvector> +generate_erdos_renyi_graph_edgelist_gnm(raft::handle_t const& handle, + int32_t num_vertices, + size_t m, + int32_t base_vertex_id, + uint64_t seed); + +template std::tuple, rmm::device_uvector> +generate_erdos_renyi_graph_edgelist_gnm(raft::handle_t const& handle, + int64_t num_vertices, + size_t m, + int64_t base_vertex_id, + uint64_t seed); + +} // namespace cugraph diff --git a/cpp/src/generators/generate_rmat_edgelist.cu b/cpp/src/generators/generate_rmat_edgelist.cu new file mode 100644 index 00000000000..c7d8a5682bc --- /dev/null +++ b/cpp/src/generators/generate_rmat_edgelist.cu @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace cugraph { + +template +std::tuple, rmm::device_uvector> generate_rmat_edgelist( + raft::handle_t const& handle, + size_t scale, + size_t num_edges, + double a, + double b, + double c, + uint64_t seed, + bool clip_and_flip) +{ + CUGRAPH_EXPECTS((size_t{1} << scale) <= static_cast(std::numeric_limits::max()), + "Invalid input argument: scale too large for vertex_t."); + CUGRAPH_EXPECTS((a >= 0.0) && (b >= 0.0) && (c >= 0.0) && (a + b + c <= 1.0), + "Invalid input argument: a, b, c should be non-negative and a + b + c should not " + "be larger than 1.0."); + + // to limit memory footprint (1024 is a tuning parameter) + auto max_edges_to_generate_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * 1024; + rmm::device_uvector rands( + std::min(num_edges, max_edges_to_generate_per_iteration) * 2 * scale, handle.get_stream_view()); + + rmm::device_uvector srcs(num_edges, handle.get_stream_view()); + rmm::device_uvector dsts(num_edges, handle.get_stream_view()); + + size_t num_edges_generated{0}; + while (num_edges_generated < num_edges) { + auto num_edges_to_generate = + std::min(num_edges - num_edges_generated, max_edges_to_generate_per_iteration); + auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(srcs.begin(), dsts.begin())) + + num_edges_generated; + + detail::uniform_random_fill( + handle.get_stream_view(), rands.data(), num_edges_to_generate * 2 * scale, 0.0f, 1.0f, seed); + seed += num_edges_to_generate * 2 * scale; + + thrust::transform( + rmm::exec_policy(handle.get_stream_view()), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_edges_to_generate), + pair_first, + // if a + 
b == 0.0, a_norm is irrelevant, if (1.0 - (a+b)) == 0.0, c_norm is irrelevant
+      [scale,
+       clip_and_flip,
+       rands = rands.data(),
+       a_plus_b = a + b,
+       a_norm = (a + b) > 0.0 ? a / (a + b) : 0.0,
+       c_norm = (1.0 - (a + b)) > 0.0 ? c / (1.0 - (a + b)) : 0.0] __device__(auto i) {
+        vertex_t src{0};
+        vertex_t dst{0};
+        // visit every bit from scale - 1 down to (and including) bit 0
+        for (size_t bit = scale; bit-- > 0;) {
+          auto r0 = rands[i * 2 * scale + 2 * bit];
+          auto r1 = rands[i * 2 * scale + 2 * bit + 1];
+          auto src_bit_set = r0 > a_plus_b;
+          auto dst_bit_set = r1 > (src_bit_set ? c_norm : a_norm);
+          if (clip_and_flip) {
+            if (src == dst) {
+              if (!src_bit_set && dst_bit_set) {
+                src_bit_set = !src_bit_set;
+                dst_bit_set = !dst_bit_set;
+              }
+            }
+          }
+          src += src_bit_set ? static_cast<vertex_t>(vertex_t{1} << bit) : 0;
+          dst += dst_bit_set ? static_cast<vertex_t>(vertex_t{1} << bit) : 0;
+        }
+        return thrust::make_tuple(src, dst);
+      });
+    num_edges_generated += num_edges_to_generate;
+  }
+
+  return std::make_tuple(std::move(srcs), std::move(dsts));
+}
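+
+// A worked example of the recursion above, for intuition only (values derived from the
+// defaults used below, a = 0.57, b = 0.19, c = 0.19): a_plus_b = 0.76,
+// a_norm = 0.57 / 0.76 ~ 0.75, c_norm = 0.19 / (1 - 0.76) ~ 0.79. Each bit picks one
+// quadrant of the (2^scale x 2^scale) adjacency matrix:
+//
+//   vertex_t src = 0, dst = 0;
+//   for (size_t bit = scale; bit-- > 0;) {
+//     bool src_bit_set = r0 > 0.76;                         // row: top vs. bottom half
+//     bool dst_bit_set = r1 > (src_bit_set ? 0.79 : 0.75);  // col: left vs. right half
+//     if (src_bit_set) src += vertex_t{1} << bit;
+//     if (dst_bit_set) dst += vertex_t{1} << bit;
+//   }
+//
+// so roughly 57% of the probability mass lands in the top-left quadrant at every level,
+// which is what concentrates edges around low vertex ids.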
+
+template <typename vertex_t>
+std::vector<std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>>>
+generate_rmat_edgelists(raft::handle_t const& handle,
+                        size_t n_edgelists,
+                        size_t min_scale,
+                        size_t max_scale,
+                        size_t edge_factor,
+                        generator_distribution_t component_distribution,
+                        generator_distribution_t edge_distribution,
+                        uint64_t seed,
+                        bool clip_and_flip)
+{
+  CUGRAPH_EXPECTS(min_scale > 0, "minimum graph scale is 1.");
+  CUGRAPH_EXPECTS(
+    size_t{1} << max_scale <= static_cast<size_t>(std::numeric_limits<vertex_t>::max()),
+    "Invalid input argument: scale too large for vertex_t.");
+
+  std::vector<std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>>> output{};
+  output.reserve(n_edgelists);
+  std::vector<size_t> scale(n_edgelists);
+
+  std::default_random_engine eng;
+  eng.seed(seed);
+  if (component_distribution == generator_distribution_t::UNIFORM) {
+    std::uniform_int_distribution<size_t> dist(min_scale, max_scale);
+    std::generate(scale.begin(), scale.end(), [&dist, &eng]() { return dist(eng); });
+  } else {
+    // May expose this as a parameter in the future.
+    std::exponential_distribution<double> dist(4);
+    // The modulo protects the range: the exponential distribution is defined on
+    // [0, infinity), but with exponent 4 most values fall between 0 and 1.
+    auto range = max_scale - min_scale;
+    std::generate(scale.begin(), scale.end(), [&dist, &eng, &min_scale, &range]() {
+      return min_scale + static_cast<size_t>(static_cast<double>(range) * dist(eng)) % range;
+    });
+  }
+
+  // initialized to standard power-law values
+  double a = 0.57, b = 0.19, c = 0.19;
+  if (edge_distribution == generator_distribution_t::UNIFORM) {
+    a = 0.25;
+    b = a;
+    c = a;
+  }
+
+  for (size_t i = 0; i < n_edgelists; i++) {
+    output.push_back(generate_rmat_edgelist<vertex_t>(
+      handle, scale[i], scale[i] * edge_factor, a, b, c, i, clip_and_flip));
+  }
+  return output;
+}
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
+generate_rmat_edgelist<int32_t>(raft::handle_t const& handle,
+                                size_t scale,
+                                size_t num_edges,
+                                double a,
+                                double b,
+                                double c,
+                                uint64_t seed,
+                                bool clip_and_flip);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
+generate_rmat_edgelist<int64_t>(raft::handle_t const& handle,
+                                size_t scale,
+                                size_t num_edges,
+                                double a,
+                                double b,
+                                double c,
+                                uint64_t seed,
+                                bool clip_and_flip);
+
+template std::vector<std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>>
+generate_rmat_edgelists<int32_t>(raft::handle_t const& handle,
+                                 size_t n_edgelists,
+                                 size_t min_scale,
+                                 size_t max_scale,
+                                 size_t edge_factor,
+                                 generator_distribution_t component_distribution,
+                                 generator_distribution_t edge_distribution,
+                                 uint64_t seed,
+                                 bool clip_and_flip);
+
+template std::vector<std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>>
+generate_rmat_edgelists<int64_t>(raft::handle_t const& handle,
+                                 size_t n_edgelists,
+                                 size_t min_scale,
+                                 size_t max_scale,
+                                 size_t edge_factor,
+                                 generator_distribution_t component_distribution,
+                                 generator_distribution_t edge_distribution,
+                                 uint64_t seed,
+                                 bool clip_and_flip);
+
+} // namespace cugraph
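A minimal caller for the generator above, as a sketch only: it assumes these functions are
declared in a public header such as cugraph/graph_generators.hpp and that a raft::handle_t
already exists for the target device.

  #include <cugraph/graph_generators.hpp>
  #include <raft/handle.hpp>

  void rmat_example(raft::handle_t const& handle)
  {
    // scale 20 -> up to 2^20 vertices, 2^24 edges, default power-law quadrant probabilities
    auto [srcs, dsts] = cugraph::generate_rmat_edgelist<int32_t>(
      handle, 20, size_t{1} << 24, 0.57, 0.19, 0.19, uint64_t{0} /* seed */, false /* clip_and_flip */);
  }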
diff --git a/cpp/src/generators/generator_tools.cu b/cpp/src/generators/generator_tools.cu
new file mode 100644
index 00000000000..375d96b0d99
--- /dev/null
+++ b/cpp/src/generators/generator_tools.cu
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+
+namespace cugraph {
+
+namespace detail {
+
+template <typename T>
+rmm::device_uvector<T> append_all(raft::handle_t const& handle,
+                                  std::vector<rmm::device_uvector<T>>&& input)
+{
+  size_t size{0};
+  for (auto& element : input)
+    size += element.size();
+
+  rmm::device_uvector<T> output(size, handle.get_stream());
+  auto output_iter = output.begin();
+
+  for (auto& element : input) {
+    raft::copy(output_iter, element.begin(), element.size(), handle.get_stream());
+    output_iter += element.size();
+  }
+
+  return output;
+}
+
+} // namespace detail
+
+template <typename vertex_t>
+void scramble_vertex_ids(raft::handle_t const& handle,
+                         rmm::device_uvector<vertex_t>& d_src_v,
+                         rmm::device_uvector<vertex_t>& d_dst_v,
+                         vertex_t vertex_id_offset,
+                         uint64_t seed)
+{
+  vertex_t scale = 1 + raft::log2(d_src_v.size());
+
+  auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(d_src_v.begin(), d_dst_v.begin()));
+  thrust::transform(rmm::exec_policy(handle.get_stream()),
+                    pair_first,
+                    pair_first + d_src_v.size(),
+                    pair_first,
+                    [scale] __device__(auto pair) {
+                      return thrust::make_tuple(
+                        experimental::detail::scramble(thrust::get<0>(pair), scale),
+                        experimental::detail::scramble(thrust::get<1>(pair), scale));
+                    });
+}
+
+template <typename vertex_t, typename weight_t>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>>
+combine_edgelists(raft::handle_t const& handle,
+                  std::vector<rmm::device_uvector<vertex_t>>&& sources,
+                  std::vector<rmm::device_uvector<vertex_t>>&& dests,
+                  std::optional<std::vector<rmm::device_uvector<weight_t>>>&& optional_d_weights,
+                  bool remove_multi_edges)
+{
+  CUGRAPH_EXPECTS(sources.size() == dests.size(),
+                  "sources and dests vertex lists must be the same size");
+
+  if (optional_d_weights) {
+    CUGRAPH_EXPECTS(sources.size() == optional_d_weights.value().size(),
+                    "has_weights is specified, sources and weights must be the same size");
+
+    thrust::for_each_n(
+      thrust::host,
+      thrust::make_zip_iterator(
+        thrust::make_tuple(sources.begin(), dests.begin(), optional_d_weights.value().begin())),
+      sources.size(),
+      [](auto tuple) {
+        CUGRAPH_EXPECTS(thrust::get<0>(tuple).size() == thrust::get<1>(tuple).size(),
+                        "source vertex and dest vertex uvectors must be same size");
+        CUGRAPH_EXPECTS(thrust::get<0>(tuple).size() == thrust::get<2>(tuple).size(),
+                        "source vertex and weights uvectors must be same size");
+      });
+  } else {
+    thrust::for_each_n(
+      thrust::host,
+      thrust::make_zip_iterator(thrust::make_tuple(sources.begin(), dests.begin())),
+      sources.size(),
+      [](auto tuple) {
+        CUGRAPH_EXPECTS(thrust::get<0>(tuple).size() == thrust::get<1>(tuple).size(),
+                        "source vertex and dest vertex uvectors must be same size");
+      });
+  }
+
+  std::vector<rmm::device_uvector<weight_t>> d_weights;
+
+  rmm::device_uvector<vertex_t> srcs_v(0, handle.get_stream());
+  rmm::device_uvector<vertex_t> dsts_v(0, handle.get_stream());
+  rmm::device_uvector<weight_t> weights_v(0, handle.get_stream());
+
+  srcs_v = detail::append_all(handle, std::move(sources));
+  dsts_v = detail::append_all(handle, std::move(dests));
+
+  if (optional_d_weights) {
+    weights_v = detail::append_all(handle, std::move(optional_d_weights.value()));
+  }
+
+  if (remove_multi_edges) {
+    size_t number_of_edges{srcs_v.size()};
+
+    if (optional_d_weights) {
+      thrust::sort(
+        rmm::exec_policy(handle.get_stream()),
+        thrust::make_zip_iterator(
+          thrust::make_tuple(srcs_v.begin(), dsts_v.begin(), weights_v.begin())),
+
thrust::make_zip_iterator(thrust::make_tuple(srcs_v.end(), dsts_v.end(), weights_v.end()))); + + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(srcs_v.begin(), dsts_v.begin())); + auto end_iter = thrust::unique_by_key(rmm::exec_policy(handle.get_stream()), + pair_first, + pair_first + srcs_v.size(), + weights_v.begin()); + + number_of_edges = thrust::distance(pair_first, thrust::get<0>(end_iter)); + } else { + thrust::sort(rmm::exec_policy(handle.get_stream()), + thrust::make_zip_iterator(thrust::make_tuple(srcs_v.begin(), dsts_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(srcs_v.end(), dsts_v.end()))); + + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(srcs_v.begin(), dsts_v.begin())); + + auto end_iter = thrust::unique( + rmm::exec_policy(handle.get_stream()), + thrust::make_zip_iterator(thrust::make_tuple(srcs_v.begin(), dsts_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(srcs_v.end(), dsts_v.end()))); + + number_of_edges = thrust::distance(pair_first, end_iter); + } + + srcs_v.resize(number_of_edges, handle.get_stream()); + srcs_v.shrink_to_fit(handle.get_stream()); + dsts_v.resize(number_of_edges, handle.get_stream()); + dsts_v.shrink_to_fit(handle.get_stream()); + + if (optional_d_weights) { + weights_v.resize(number_of_edges, handle.get_stream()); + weights_v.shrink_to_fit(handle.get_stream()); + } + } + + return std::make_tuple( + std::move(srcs_v), + std::move(dsts_v), + optional_d_weights + ? std::move(std::optional>(std::move(weights_v))) + : std::nullopt); +} + +template +std::tuple, + rmm::device_uvector, + std::optional>> +symmetrize_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& d_src_v, + rmm::device_uvector&& d_dst_v, + std::optional>&& optional_d_weights_v) +{ + auto offset = d_src_v.size(); + d_src_v.resize(offset * 2, handle.get_stream_view()); + d_dst_v.resize(offset * 2, handle.get_stream_view()); + + thrust::copy(rmm::exec_policy(handle.get_stream_view()), + d_dst_v.begin(), + d_dst_v.begin() + offset, + d_src_v.begin() + offset); + thrust::copy(rmm::exec_policy(handle.get_stream_view()), + d_src_v.begin(), + d_src_v.begin() + offset, + d_dst_v.begin() + offset); + if (optional_d_weights_v) { + optional_d_weights_v->resize(d_src_v.size(), handle.get_stream_view()); + thrust::copy(rmm::exec_policy(handle.get_stream_view()), + optional_d_weights_v->begin(), + optional_d_weights_v->begin() + offset, + optional_d_weights_v->begin() + offset); + } + + return std::make_tuple(std::move(d_src_v), + std::move(d_dst_v), + optional_d_weights_v ? 
std::move(optional_d_weights_v) : std::nullopt); +} + +template void scramble_vertex_ids(raft::handle_t const& handle, + rmm::device_uvector& d_src_v, + rmm::device_uvector& d_dst_v, + int32_t vertex_id_offset, + uint64_t seed); + +template void scramble_vertex_ids(raft::handle_t const& handle, + rmm::device_uvector& d_src_v, + rmm::device_uvector& d_dst_v, + int64_t vertex_id_offset, + uint64_t seed); + +template std::tuple, + rmm::device_uvector, + std::optional>> +combine_edgelists(raft::handle_t const& handle, + std::vector>&& sources, + std::vector>&& dests, + std::optional>>&& optional_d_weights, + bool remove_multi_edges); + +template std::tuple, + rmm::device_uvector, + std::optional>> +combine_edgelists(raft::handle_t const& handle, + std::vector>&& sources, + std::vector>&& dests, + std::optional>>&& optional_d_weights, + bool remove_multi_edges); + +template std::tuple, + rmm::device_uvector, + std::optional>> +combine_edgelists(raft::handle_t const& handle, + std::vector>&& sources, + std::vector>&& dests, + std::optional>>&& optional_d_weights, + bool remove_multi_edges); + +template std::tuple, + rmm::device_uvector, + std::optional>> +combine_edgelists(raft::handle_t const& handle, + std::vector>&& sources, + std::vector>&& dests, + std::optional>>&& optional_d_weights, + bool remove_multi_edges); + +template std::tuple, + rmm::device_uvector, + std::optional>> +symmetrize_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& d_src_v, + rmm::device_uvector&& d_dst_v, + std::optional>&& optional_d_weights_v); +template std::tuple, + rmm::device_uvector, + std::optional>> +symmetrize_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& d_src_v, + rmm::device_uvector&& d_dst_v, + std::optional>&& optional_d_weights_v); + +template std::tuple, + rmm::device_uvector, + std::optional>> +symmetrize_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& d_src_v, + rmm::device_uvector&& d_dst_v, + std::optional>&& optional_d_weights_v); +template std::tuple, + rmm::device_uvector, + std::optional>> +symmetrize_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& d_src_v, + rmm::device_uvector&& d_dst_v, + std::optional>&& optional_d_weights_v); + +} // namespace cugraph diff --git a/cpp/src/generators/simple_generators.cu b/cpp/src/generators/simple_generators.cu new file mode 100644 index 00000000000..413e08962e7 --- /dev/null +++ b/cpp/src/generators/simple_generators.cu @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+
+namespace cugraph {
+
+template <typename vertex_t>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>>
+generate_path_graph_edgelist(raft::handle_t const& handle,
+                             std::vector<std::tuple<vertex_t, vertex_t>> const& component_parms_v)
+{
+  size_t num_edges = thrust::transform_reduce(
+    thrust::host,
+    component_parms_v.begin(),
+    component_parms_v.end(),
+    [](auto tuple) { return (std::get<0>(tuple) - 1); },
+    size_t{0},
+    std::plus<size_t>());
+
+  bool edge_off_end{false};
+
+  if (handle.comms_initialized()) {
+    auto& comm = handle.get_comms();
+    auto const comm_size = comm.get_size();
+    auto const comm_rank = comm.get_rank();
+
+    if (comm_size > 1) {
+      // every rank except the last adds one extra edge per component, chaining its
+      // partition to the first vertex owned by the next rank
+      if (comm_rank < (comm_size - 1)) {
+        num_edges += component_parms_v.size();
+        edge_off_end = true;
+      }
+    }
+  }
+
+  rmm::device_uvector<vertex_t> d_src_v(num_edges, handle.get_stream());
+  rmm::device_uvector<vertex_t> d_dst_v(num_edges, handle.get_stream());
+
+  auto src_iterator = d_src_v.begin();
+  auto dst_iterator = d_dst_v.begin();
+
+  for (auto tuple : component_parms_v) {
+    vertex_t num_vertices, base_vertex_id;
+    std::tie(num_vertices, base_vertex_id) = tuple;
+
+    // per-component edge count (shadows the total above)
+    vertex_t num_edges{num_vertices - 1};
+
+    if (edge_off_end) ++num_edges;
+
+    thrust::sequence(rmm::exec_policy(handle.get_stream()),
+                     src_iterator,
+                     src_iterator + num_edges,
+                     base_vertex_id);
+
+    thrust::sequence(rmm::exec_policy(handle.get_stream()),
+                     dst_iterator,
+                     dst_iterator + num_edges,
+                     base_vertex_id + 1);
+
+    src_iterator += num_edges;
+    dst_iterator += num_edges;
+  }
+
+  handle.get_stream_view().synchronize();
+
+  return std::make_tuple(std::move(d_src_v), std::move(d_dst_v));
+}
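+
+// Shape of the output for intuition (a sketch, not a test): a single component with
+// num_vertices = 5, base_vertex_id = 0 and edge_off_end == false yields
+//
+//   d_src_v = { 0, 1, 2, 3 }
+//   d_dst_v = { 1, 2, 3, 4 }
+//
+// i.e. the two thrust::sequence calls emit the same ramp shifted by one. With
+// edge_off_end == true each component gains one extra edge (4, 5) chaining it to the
+// first vertex owned by the next rank.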
+
+template <typename vertex_t>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>>
+generate_2d_mesh_graph_edgelist(
+  raft::handle_t const& handle,
+  std::vector<std::tuple<vertex_t, vertex_t, vertex_t>> const& component_parms_v)
+{
+  size_t num_edges = thrust::transform_reduce(
+    thrust::host,
+    component_parms_v.begin(),
+    component_parms_v.end(),
+    [](auto tuple) {
+      vertex_t x, y;
+      std::tie(x, y, std::ignore) = tuple;
+
+      return ((x - 1) * y) + (x * (y - 1));
+    },
+    size_t{0},
+    std::plus<size_t>());
+
+  rmm::device_uvector<vertex_t> d_src_v(num_edges, handle.get_stream());
+  rmm::device_uvector<vertex_t> d_dst_v(num_edges, handle.get_stream());
+
+  auto output_iterator =
+    thrust::make_zip_iterator(thrust::make_tuple(d_src_v.begin(), d_dst_v.begin()));
+
+  for (auto tuple : component_parms_v) {
+    vertex_t x, y, base_vertex_id;
+    std::tie(x, y, base_vertex_id) = tuple;
+
+    vertex_t num_vertices = x * y;
+
+    auto x_iterator = thrust::make_zip_iterator(
+      thrust::make_tuple(thrust::make_counting_iterator(base_vertex_id),
+                         thrust::make_counting_iterator(base_vertex_id + 1)));
+
+    output_iterator = thrust::copy_if(rmm::exec_policy(handle.get_stream()),
+                                      x_iterator,
+                                      x_iterator + num_vertices - 1,
+                                      output_iterator,
+                                      [base_vertex_id, x] __device__(auto pair) {
+                                        vertex_t dst = thrust::get<1>(pair);
+                                        // Skip when src sits in the last column, i.e. dst
+                                        // has wrapped to the first column of the next row
+                                        return ((dst - base_vertex_id) % x) != 0;
+                                      });
+
+    auto y_iterator = thrust::make_zip_iterator(
+      thrust::make_tuple(thrust::make_counting_iterator(base_vertex_id),
+                         thrust::make_counting_iterator(base_vertex_id + x)));
+
+    output_iterator = thrust::copy_if(rmm::exec_policy(handle.get_stream()),
+                                      y_iterator,
+                                      y_iterator + num_vertices - x,
+                                      output_iterator,
+                                      [base_vertex_id, x, y] __device__(auto pair) {
+                                        vertex_t dst = thrust::get<1>(pair);
+
+                                        // Skip when dst falls in the first row of the next
+                                        // component
+                                        return ((dst - base_vertex_id) % (x * y)) >= x;
+                                      });
+  }
+
+  handle.get_stream_view().synchronize();
+
+  return std::make_tuple(std::move(d_src_v), std::move(d_dst_v));
+}
+
+template <typename vertex_t>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>>
+generate_3d_mesh_graph_edgelist(
+  raft::handle_t const& handle,
+  std::vector<std::tuple<vertex_t, vertex_t, vertex_t, vertex_t>> const& component_parms_v)
+{
+  size_t num_edges = thrust::transform_reduce(
+    thrust::host,
+    component_parms_v.begin(),
+    component_parms_v.end(),
+    [](auto tuple) {
+      vertex_t x, y, z;
+      std::tie(x, y, z, std::ignore) = tuple;
+
+      return ((x - 1) * y * z) + (x * (y - 1) * z) + (x * y * (z - 1));
+    },
+    size_t{0},
+    std::plus<size_t>());
+
+  rmm::device_uvector<vertex_t> d_src_v(num_edges, handle.get_stream());
+  rmm::device_uvector<vertex_t> d_dst_v(num_edges, handle.get_stream());
+
+  auto output_iterator =
+    thrust::make_zip_iterator(thrust::make_tuple(d_src_v.begin(), d_dst_v.begin()));
+
+  for (auto tuple : component_parms_v) {
+    vertex_t x, y, z, base_vertex_id;
+    std::tie(x, y, z, base_vertex_id) = tuple;
+
+    vertex_t num_vertices = x * y * z;
+
+    auto x_iterator = thrust::make_zip_iterator(
+      thrust::make_tuple(thrust::make_counting_iterator(base_vertex_id),
+                         thrust::make_counting_iterator(base_vertex_id + 1)));
+
+    output_iterator = thrust::copy_if(rmm::exec_policy(handle.get_stream()),
+                                      x_iterator,
+                                      x_iterator + num_vertices - 1,
+                                      output_iterator,
+                                      [base_vertex_id, x] __device__(auto pair) {
+                                        vertex_t dst = thrust::get<1>(pair);
+                                        // Skip when src sits in the last column, i.e. dst
+                                        // has wrapped to the first column of the next row
+                                        return ((dst - base_vertex_id) % x) != 0;
+                                      });
+
+    auto y_iterator = thrust::make_zip_iterator(
+      thrust::make_tuple(thrust::make_counting_iterator(base_vertex_id),
+                         thrust::make_counting_iterator(base_vertex_id + x)));
+
+    output_iterator = thrust::copy_if(rmm::exec_policy(handle.get_stream()),
+                                      y_iterator,
+                                      y_iterator + num_vertices - x,
+                                      output_iterator,
+                                      [base_vertex_id, x, y] __device__(auto pair) {
+                                        vertex_t dst = thrust::get<1>(pair);
+                                        // Skip when dst falls in the first row of the next
+                                        // component
+                                        return ((dst - base_vertex_id) % (x * y)) >= x;
+                                      });
+
+    auto z_iterator = thrust::make_zip_iterator(
+      thrust::make_tuple(thrust::make_counting_iterator(base_vertex_id),
+                         thrust::make_counting_iterator(base_vertex_id + x * y)));
+
+    output_iterator = thrust::copy_if(rmm::exec_policy(handle.get_stream()),
+                                      z_iterator,
+                                      z_iterator + num_vertices - x * y,
+                                      output_iterator,
+                                      [base_vertex_id, x, y, z] __device__(auto pair) {
+                                        vertex_t dst = thrust::get<1>(pair);
+                                        // Skip when dst falls in the first plane of the next
+                                        // component
+                                        return ((dst - base_vertex_id) % (x * y * z)) >= (x * y);
+                                      });
+  }
+
+  handle.get_stream_view().synchronize();
+
+  return std::make_tuple(std::move(d_src_v), std::move(d_dst_v));
+}
+
+template <typename vertex_t>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>>
+generate_complete_graph_edgelist(
+  raft::handle_t const& handle,
+  std::vector<std::tuple<vertex_t, vertex_t>> const& component_parms_v)
+{
+  std::for_each(component_parms_v.begin(), component_parms_v.end(), [](auto tuple) {
+    vertex_t num_vertices = std::get<0>(tuple);
+    CUGRAPH_EXPECTS(num_vertices < std::numeric_limits<vertex_t>::max(),
+                    "Implementation cannot support specified value");
+  });
+
+  size_t num_edges = thrust::transform_reduce(
+    thrust::host,
+    component_parms_v.begin(),
+    component_parms_v.end(),
+    [](auto tuple) {
+      vertex_t num_vertices = std::get<0>(tuple);
+      return num_vertices * (num_vertices - 1) / 2;
+    },
+    size_t{0},
+    std::plus<size_t>());
+
+  vertex_t invalid_vertex{std::numeric_limits<vertex_t>::max()};
+
+  rmm::device_uvector<vertex_t> d_src_v(num_edges, handle.get_stream());
+  rmm::device_uvector<vertex_t> d_dst_v(num_edges, handle.get_stream());
+
+  auto output_iterator =
+    thrust::make_zip_iterator(thrust::make_tuple(d_src_v.begin(), d_dst_v.begin()));
+
+  for (auto tuple : component_parms_v) {
+    vertex_t num_vertices, base_vertex_id;
+    std::tie(num_vertices, base_vertex_id) = tuple;
+
+    auto transform_iter = thrust::make_transform_iterator(
+      thrust::make_counting_iterator(0),
+      [base_vertex_id, num_vertices, invalid_vertex] __device__(size_t index) {
+        size_t graph_index = index / (num_vertices * num_vertices);
+        size_t local_index = index % (num_vertices * num_vertices);
+
+        vertex_t src = base_vertex_id + static_cast<vertex_t>(local_index / num_vertices);
+        vertex_t dst = base_vertex_id + static_cast<vertex_t>(local_index % num_vertices);
+
+        if (src == dst) {
+          src = invalid_vertex;
+          dst = invalid_vertex;
+        } else {
+          src += (graph_index * num_vertices);
+          dst += (graph_index * num_vertices);
+        }
+
+        return thrust::make_tuple(src, dst);
+      });
+
+    output_iterator = thrust::copy_if(rmm::exec_policy(handle.get_stream()),
+                                      transform_iter,
+                                      transform_iter + num_vertices * num_vertices,
+                                      output_iterator,
+                                      [invalid_vertex] __device__(auto tuple) {
+                                        auto src = thrust::get<0>(tuple);
+                                        auto dst = thrust::get<1>(tuple);
+
+                                        return (src != invalid_vertex) && (src < dst);
+                                      });
+  }
+
+  handle.get_stream_view().synchronize();
+
+  return std::make_tuple(std::move(d_src_v), std::move(d_dst_v));
+}
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
+generate_path_graph_edgelist<int32_t>(
+  raft::handle_t const& handle,
+  std::vector<std::tuple<int32_t, int32_t>> const& component_parms_v);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
+generate_path_graph_edgelist<int64_t>(
+  raft::handle_t const& handle,
+  std::vector<std::tuple<int64_t, int64_t>> const& component_parms_v);
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
+generate_2d_mesh_graph_edgelist<int32_t>(
+  raft::handle_t const& handle,
+  std::vector<std::tuple<int32_t, int32_t, int32_t>> const& component_parms_v);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
+generate_2d_mesh_graph_edgelist<int64_t>(
+  raft::handle_t const& handle,
+  std::vector<std::tuple<int64_t, int64_t, int64_t>> const& component_parms_v);
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
+generate_3d_mesh_graph_edgelist<int32_t>(
+  raft::handle_t const& handle,
+  std::vector<std::tuple<int32_t, int32_t, int32_t, int32_t>> const& component_parms_v);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
+generate_3d_mesh_graph_edgelist<int64_t>(
+  raft::handle_t const& handle,
+  std::vector<std::tuple<int64_t, int64_t, int64_t, int64_t>> const& component_parms_v);
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
+generate_complete_graph_edgelist<int32_t>(
+  raft::handle_t const& handle,
+  std::vector<std::tuple<int32_t, int32_t>> const& component_parms_v);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
+generate_complete_graph_edgelist<int64_t>(
+  raft::handle_t const& handle,
+  std::vector<std::tuple<int64_t, int64_t>> const& component_parms_v);
+
+} // namespace cugraph
diff --git a/cpp/src/layout/barnes_hut.cuh b/cpp/src/layout/barnes_hut.cuh
new file mode 100644
index 00000000000..d05c6051d8b
--- /dev/null
+++ b/cpp/src/layout/barnes_hut.cuh
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "bh_kernels.cuh"
+#include "fa2_kernels.cuh"
+#include "utils.hpp"
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace cugraph {
+namespace detail {
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+void barnes_hut(raft::handle_t const& handle,
+                legacy::GraphCOOView<vertex_t, edge_t, weight_t>& graph,
+                float* pos,
+                const int max_iter = 500,
+                float* x_start = nullptr,
+                float* y_start = nullptr,
+                bool outbound_attraction_distribution = true,
+                bool lin_log_mode = false,
+                bool prevent_overlapping = false,
+                const float edge_weight_influence = 1.0,
+                const float jitter_tolerance = 1.0,
+                const float theta = 0.5,
+                const float scaling_ratio = 2.0,
+                bool strong_gravity_mode = false,
+                const float gravity = 1.0,
+                bool verbose = false,
+                internals::GraphBasedDimRedCallback* callback = nullptr)
+{
+  rmm::cuda_stream_view stream_view(handle.get_stream_view());
+  const edge_t e = graph.number_of_edges;
+  const vertex_t n = graph.number_of_vertices;
+
+  const int blocks = getMultiProcessorCount();
+  // A tiny jitter to promote numerical stability.
+  const float epssq = 0.0025;
+  // We use the same array for nodes and cells.
+  int nnodes = n * 2;
+  if (nnodes < 1024 * blocks) nnodes = 1024 * blocks;
+  while ((nnodes & (32 - 1)) != 0)
+    nnodes++;
+  nnodes--;
+
+  // Allocate more space
+  //---------------------------------------------------
+  rmm::device_uvector<unsigned> d_limiter(1, stream_view);
+  rmm::device_uvector<int> d_maxdepthd(1, stream_view);
+  rmm::device_uvector<int> d_bottomd(1, stream_view);
+  rmm::device_uvector<float> d_radiusd(1, stream_view);
+
+  unsigned* limiter = d_limiter.data();
+  int* maxdepthd = d_maxdepthd.data();
+  int* bottomd = d_bottomd.data();
+  float* radiusd = d_radiusd.data();
+
+  InitializationKernel<<<1, 1, 0, stream_view.value()>>>(limiter, maxdepthd, radiusd);
+  CHECK_CUDA(stream_view.value());
+
+  const int FOUR_NNODES = 4 * nnodes;
+  const int FOUR_N = 4 * n;
+  const float theta_squared = theta * theta;
+  const int NNODES = nnodes;
+
+  rmm::device_uvector<int> d_startl(nnodes + 1, stream_view);
+  rmm::device_uvector<int> d_childl((nnodes + 1) * 4, stream_view);
+  // FA2 requires degree + 1
+  rmm::device_uvector<int> d_massl(nnodes + 1, stream_view);
+  thrust::fill(rmm::exec_policy(stream_view), d_massl.begin(), d_massl.end(), 1);
+
+  rmm::device_uvector<float> d_maxxl(blocks * FACTOR1, stream_view);
+  rmm::device_uvector<float> d_maxyl(blocks * FACTOR1, stream_view);
+  rmm::device_uvector<float> d_minxl(blocks * FACTOR1, stream_view);
+  rmm::device_uvector<float> d_minyl(blocks * FACTOR1, stream_view);
+
+  // Actual mallocs
+  int* startl = d_startl.data();
+  int* childl = d_childl.data();
+  int* massl = d_massl.data();
+
+  float* maxxl = d_maxxl.data();
+  float* maxyl = d_maxyl.data();
+  float* minxl = d_minxl.data();
+  float* minyl = d_minyl.data();
+
+  // SummarizationKernel
+  rmm::device_uvector<int> d_countl(nnodes + 1, stream_view);
+  int* countl = d_countl.data();
+
+  // SortKernel
+  rmm::device_uvector<int> d_sortl(nnodes + 1, stream_view);
+  int* sortl = d_sortl.data();
+
+  // RepulsionKernel
+  rmm::device_uvector<float> d_rep_forces((nnodes + 1) * 2, stream_view);
+  float* rep_forces = d_rep_forces.data();
+
+  rmm::device_uvector<float> d_radius_squared(1, stream_view);
+  float* radiusd_squared = d_radius_squared.data();
+
+  rmm::device_uvector<float> d_nodes_pos((nnodes + 1) * 2, stream_view);
+  float* nodes_pos = d_nodes_pos.data();
+
+  // Initialize positions with random values
+  int random_state = 0;
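+
+  // Layout note (illustration only): positions are stored struct-of-arrays style in one
+  // buffer, x in nodes_pos[0 .. nnodes] and y in nodes_pos[nnodes + 1 .. 2 * nnodes + 1],
+  // which is why every kernel below receives the pair (nodes_pos, nodes_pos + nnodes + 1).
+  // Reading body i would look like:
+  //   float xi = nodes_pos[i];
+  //   float yi = nodes_pos[nnodes + 1 + i];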
+  // Copy start x and y positions.
+  if (x_start && y_start) {
+    raft::copy(nodes_pos, x_start, n, stream_view.value());
+    raft::copy(nodes_pos + nnodes + 1, y_start, n, stream_view.value());
+  } else {
+    uniform_random_fill(
+      handle.get_stream_view(), nodes_pos, (nnodes + 1) * 2, -100.0f, 100.0f, random_state);
+  }
+
+  // Allocate arrays for force computation
+  float* attract{nullptr};
+  float* old_forces{nullptr};
+  float* swinging{nullptr};
+  float* traction{nullptr};
+
+  rmm::device_uvector<float> d_attract(n * 2, stream_view);
+  rmm::device_uvector<float> d_old_forces(n * 2, stream_view);
+  rmm::device_uvector<float> d_swinging(n, stream_view);
+  rmm::device_uvector<float> d_traction(n, stream_view);
+
+  attract = d_attract.data();
+  old_forces = d_old_forces.data();
+  swinging = d_swinging.data();
+  traction = d_traction.data();
+
+  thrust::fill(rmm::exec_policy(stream_view), d_old_forces.begin(), d_old_forces.end(), 0.f);
+
+  // Sort COO for coalesced memory access.
+  sort(graph, stream_view.value());
+  CHECK_CUDA(stream_view.value());
+
+  graph.degree(massl, cugraph::legacy::DegreeDirection::OUT);
+  CHECK_CUDA(stream_view.value());
+
+  const vertex_t* row = graph.src_indices;
+  const vertex_t* col = graph.dst_indices;
+  const weight_t* v = graph.edge_data;
+
+  // Scalars used to adapt global speed.
+  float speed = 1.f;
+  float speed_efficiency = 1.f;
+  float outbound_att_compensation = 1.f;
+  float jt = 0.f;
+
+  // If outboundAttractionDistribution active, compensate.
+  if (outbound_attraction_distribution) {
+    int sum = thrust::reduce(rmm::exec_policy(stream_view), d_massl.begin(), d_massl.begin() + n);
+    outbound_att_compensation = sum / (float)n;
+  }
+
+  //
+  // Set cache levels for faster algorithm execution
+  //---------------------------------------------------
+  cudaFuncSetCacheConfig(BoundingBoxKernel, cudaFuncCachePreferShared);
+  cudaFuncSetCacheConfig(TreeBuildingKernel, cudaFuncCachePreferL1);
+  cudaFuncSetCacheConfig(ClearKernel1, cudaFuncCachePreferL1);
+  cudaFuncSetCacheConfig(ClearKernel2, cudaFuncCachePreferL1);
+  cudaFuncSetCacheConfig(SummarizationKernel, cudaFuncCachePreferShared);
+  cudaFuncSetCacheConfig(SortKernel, cudaFuncCachePreferL1);
+  cudaFuncSetCacheConfig(RepulsionKernel, cudaFuncCachePreferL1);
+  cudaFuncSetCacheConfig(apply_forces_bh, cudaFuncCachePreferL1);
+
+  if (callback) {
+    callback->setup(nnodes + 1, 2);
+    callback->on_preprocess_end(nodes_pos);
+  }
+
+  for (int iter = 0; iter < max_iter; ++iter) {
+    // Reset force values
+    thrust::fill(rmm::exec_policy(stream_view), d_rep_forces.begin(), d_rep_forces.end(), 0.f);
+    thrust::fill(rmm::exec_policy(stream_view), d_attract.begin(), d_attract.end(), 0.f);
+    thrust::fill(rmm::exec_policy(stream_view), d_swinging.begin(), d_swinging.end(), 0.f);
+    thrust::fill(rmm::exec_policy(stream_view), d_traction.begin(), d_traction.end(), 0.f);
+
+    ResetKernel<<<1, 1, 0, stream_view.value()>>>(radiusd_squared, bottomd, NNODES, radiusd);
+    CHECK_CUDA(stream_view.value());
+
+    // Compute bounding box around all bodies
+    BoundingBoxKernel<<<blocks * FACTOR1, THREADS1, 0, stream_view.value()>>>(
+      startl,
+      childl,
+      massl,
+      nodes_pos,
+      nodes_pos + nnodes + 1,
+      maxxl,
+      maxyl,
+      minxl,
+      minyl,
+      FOUR_NNODES,
+      NNODES,
+      n,
+      limiter,
+      radiusd);
+    CHECK_CUDA(stream_view.value());
+
+    ClearKernel1<<<blocks, 1024, 0, stream_view.value()>>>(childl, FOUR_NNODES, FOUR_N);
+    CHECK_CUDA(stream_view.value());
+
+    // Build quadtree
+    TreeBuildingKernel<<<blocks * FACTOR2, THREADS2, 0, stream_view.value()>>>(
+      childl, nodes_pos, nodes_pos + nnodes + 1, NNODES, n, maxdepthd, bottomd, radiusd);
+    CHECK_CUDA(stream_view.value());
+
+    ClearKernel2<<<blocks, 1024, 0, stream_view.value()>>>(startl, massl, NNODES, bottomd);
+    CHECK_CUDA(stream_view.value());
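+
+    // The rest of the iteration follows the classic Barnes-Hut pipeline: summarize
+    // mass/position per cell bottom-up, sort bodies so spatially close bodies land in
+    // adjacent threads, then approximate repulsion in O(n log n). A cell is treated as
+    // a single far-field point roughly when (cell_size / distance) < theta, evaluated
+    // below in squared form via theta_squared and radiusd_squared.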
+    // Summarize mass and position for each cell, bottom-up approach
+    SummarizationKernel<<<blocks * FACTOR3, THREADS3, 0, stream_view.value()>>>(
+      countl, childl, massl, nodes_pos, nodes_pos + nnodes + 1, NNODES, n, bottomd);
+    CHECK_CUDA(stream_view.value());
+
+    // Group close bodies together; used to speed up the repulsion kernel
+    SortKernel<<<blocks * FACTOR4, THREADS4, 0, stream_view.value()>>>(
+      sortl, countl, startl, childl, NNODES, n, bottomd);
+    CHECK_CUDA(stream_view.value());
+
+    // Force computation O(n log n)
+    RepulsionKernel<<<blocks * FACTOR5, THREADS5, 0, stream_view.value()>>>(
+      scaling_ratio,
+      theta,
+      epssq,
+      sortl,
+      childl,
+      massl,
+      nodes_pos,
+      nodes_pos + nnodes + 1,
+      rep_forces,
+      rep_forces + nnodes + 1,
+      theta_squared,
+      NNODES,
+      FOUR_NNODES,
+      n,
+      radiusd_squared,
+      maxdepthd);
+    CHECK_CUDA(stream_view.value());
+
+    apply_gravity(nodes_pos,
+                  nodes_pos + nnodes + 1,
+                  attract,
+                  attract + n,
+                  massl,
+                  gravity,
+                  strong_gravity_mode,
+                  scaling_ratio,
+                  n,
+                  stream_view.value());
+
+    apply_attraction(row,
+                     col,
+                     v,
+                     e,
+                     nodes_pos,
+                     nodes_pos + nnodes + 1,
+                     attract,
+                     attract + n,
+                     massl,
+                     outbound_attraction_distribution,
+                     lin_log_mode,
+                     edge_weight_influence,
+                     outbound_att_compensation,
+                     stream_view.value());
+
+    compute_local_speed(rep_forces,
+                        rep_forces + nnodes + 1,
+                        attract,
+                        attract + n,
+                        old_forces,
+                        old_forces + n,
+                        massl,
+                        swinging,
+                        traction,
+                        n,
+                        stream_view.value());
+
+    // Compute global swinging and traction values
+    const float s =
+      thrust::reduce(rmm::exec_policy(stream_view), d_swinging.begin(), d_swinging.end());
+
+    const float t =
+      thrust::reduce(rmm::exec_policy(stream_view), d_traction.begin(), d_traction.end());
+
+    // Compute global speed based on global and local swinging and traction.
+    adapt_speed(jitter_tolerance, &jt, &speed, &speed_efficiency, s, t, n);
+
+    // Update positions
+    apply_forces_bh<<<blocks * FACTOR6, THREADS6, 0, stream_view.value()>>>(
+      nodes_pos,
+      nodes_pos + nnodes + 1,
+      attract,
+      attract + n,
+      rep_forces,
+      rep_forces + nnodes + 1,
+      old_forces,
+      old_forces + n,
+      swinging,
+      speed,
+      n);
+
+    if (callback) callback->on_epoch_end(nodes_pos);
+
+    if (verbose) {
+      std::cout << "iteration: " << iter + 1 << ", speed: " << speed
+                << ", speed_efficiency: " << speed_efficiency << ", jt: " << jt
+                << ", swinging: " << s << ", traction: " << t << "\n";
+    }
+  }
+
+  // Copy node positions into the final output pos
+  raft::copy(pos, nodes_pos, n, stream_view.value());
+  raft::copy(pos + n, nodes_pos + nnodes + 1, n, stream_view.value());
+
+  if (callback) callback->on_train_end(nodes_pos);
+}
+
+} // namespace detail
+} // namespace cugraph
diff --git a/cpp/src/layout/barnes_hut.hpp b/cpp/src/layout/barnes_hut.hpp
deleted file mode 100644
index 437c98fce4b..00000000000
--- a/cpp/src/layout/barnes_hut.hpp
+++ /dev/null
@@ -1,337 +0,0 @@
-/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#pragma once - -#include -#include - -#include -#include -#include -#include - -#include "bh_kernels.hpp" -#include "fa2_kernels.hpp" -#include "utilities/graph_utils.cuh" -#include "utils.hpp" - -namespace cugraph { -namespace detail { - -template -void barnes_hut(GraphCOOView &graph, - float *pos, - const int max_iter = 500, - float *x_start = nullptr, - float *y_start = nullptr, - bool outbound_attraction_distribution = true, - bool lin_log_mode = false, - bool prevent_overlapping = false, - const float edge_weight_influence = 1.0, - const float jitter_tolerance = 1.0, - const float theta = 0.5, - const float scaling_ratio = 2.0, - bool strong_gravity_mode = false, - const float gravity = 1.0, - bool verbose = false, - internals::GraphBasedDimRedCallback *callback = nullptr) -{ - cudaStream_t stream = {nullptr}; - const edge_t e = graph.number_of_edges; - const vertex_t n = graph.number_of_vertices; - - const int blocks = getMultiProcessorCount(); - // A tiny jitter to promote numerical stability/ - const float epssq = 0.0025; - // We use the same array for nodes and cells. - int nnodes = n * 2; - if (nnodes < 1024 * blocks) nnodes = 1024 * blocks; - while ((nnodes & (32 - 1)) != 0) nnodes++; - nnodes--; - - // Allocate more space - //--------------------------------------------------- - rmm::device_vector d_limiter(1); - rmm::device_vector d_maxdepthd(1); - rmm::device_vector d_bottomd(1); - rmm::device_vector d_radiusd(1); - - unsigned *limiter = d_limiter.data().get(); - int *maxdepthd = d_maxdepthd.data().get(); - int *bottomd = d_bottomd.data().get(); - float *radiusd = d_radiusd.data().get(); - - InitializationKernel<<<1, 1, 0, stream>>>(limiter, maxdepthd, radiusd); - CHECK_CUDA(stream); - - const int FOUR_NNODES = 4 * nnodes; - const int FOUR_N = 4 * n; - const float theta_squared = theta * theta; - const int NNODES = nnodes; - - rmm::device_vector d_startl(nnodes + 1, 0); - rmm::device_vector d_childl((nnodes + 1) * 4, 0); - // FA2 requires degree + 1 - rmm::device_vector d_massl(nnodes + 1, 1.f); - - rmm::device_vector d_maxxl(blocks * FACTOR1, 0); - rmm::device_vector d_maxyl(blocks * FACTOR1, 0); - rmm::device_vector d_minxl(blocks * FACTOR1, 0); - rmm::device_vector d_minyl(blocks * FACTOR1, 0); - - // Actual mallocs - int *startl = d_startl.data().get(); - int *childl = d_childl.data().get(); - int *massl = d_massl.data().get(); - - float *maxxl = d_maxxl.data().get(); - float *maxyl = d_maxyl.data().get(); - float *minxl = d_minxl.data().get(); - float *minyl = d_minyl.data().get(); - - // SummarizationKernel - rmm::device_vector d_countl(nnodes + 1, 0); - int *countl = d_countl.data().get(); - - // SortKernel - rmm::device_vector d_sortl(nnodes + 1, 0); - int *sortl = d_sortl.data().get(); - - // RepulsionKernel - rmm::device_vector d_rep_forces((nnodes + 1) * 2, 0); - float *rep_forces = d_rep_forces.data().get(); - - rmm::device_vector d_radius_squared(1, 0); - float *radiusd_squared = d_radius_squared.data().get(); - - rmm::device_vector d_nodes_pos((nnodes + 1) * 2, 0); - float *nodes_pos = d_nodes_pos.data().get(); - - // Initialize positions with random values - int random_state = 0; - - // Copy start x and y positions. 
- if (x_start && y_start) { - copy(n, x_start, nodes_pos); - copy(n, y_start, nodes_pos + nnodes + 1); - } else { - random_vector(nodes_pos, (nnodes + 1) * 2, random_state, stream); - } - - // Allocate arrays for force computation - float *attract{nullptr}; - float *old_forces{nullptr}; - float *swinging{nullptr}; - float *traction{nullptr}; - - rmm::device_vector d_attract(n * 2, 0); - rmm::device_vector d_old_forces(n * 2, 0); - rmm::device_vector d_swinging(n, 0); - rmm::device_vector d_traction(n, 0); - - attract = d_attract.data().get(); - old_forces = d_old_forces.data().get(); - swinging = d_swinging.data().get(); - traction = d_traction.data().get(); - - // Sort COO for coalesced memory access. - sort(graph, stream); - CHECK_CUDA(stream); - - graph.degree(massl, cugraph::DegreeDirection::OUT); - CHECK_CUDA(stream); - - const vertex_t *row = graph.src_indices; - const vertex_t *col = graph.dst_indices; - const weight_t *v = graph.edge_data; - - // Scalars used to adapt global speed. - float speed = 1.f; - float speed_efficiency = 1.f; - float outbound_att_compensation = 1.f; - float jt = 0.f; - - // If outboundAttractionDistribution active, compensate. - if (outbound_attraction_distribution) { - int sum = - thrust::reduce(rmm::exec_policy(stream)->on(stream), d_massl.begin(), d_massl.begin() + n); - outbound_att_compensation = sum / (float)n; - } - - // - // Set cache levels for faster algorithm execution - //--------------------------------------------------- - cudaFuncSetCacheConfig(BoundingBoxKernel, cudaFuncCachePreferShared); - cudaFuncSetCacheConfig(TreeBuildingKernel, cudaFuncCachePreferL1); - cudaFuncSetCacheConfig(ClearKernel1, cudaFuncCachePreferL1); - cudaFuncSetCacheConfig(ClearKernel2, cudaFuncCachePreferL1); - cudaFuncSetCacheConfig(SummarizationKernel, cudaFuncCachePreferShared); - cudaFuncSetCacheConfig(SortKernel, cudaFuncCachePreferL1); - cudaFuncSetCacheConfig(RepulsionKernel, cudaFuncCachePreferL1); - cudaFuncSetCacheConfig(apply_forces_bh, cudaFuncCachePreferL1); - - if (callback) { - callback->setup(nnodes + 1, 2); - callback->on_preprocess_end(nodes_pos); - } - - for (int iter = 0; iter < max_iter; ++iter) { - // Reset force values - fill((nnodes + 1) * 2, rep_forces, 0.f); - fill(n * 2, attract, 0.f); - fill(n, swinging, 0.f); - fill(n, traction, 0.f); - - ResetKernel<<<1, 1, 0, stream>>>(radiusd_squared, bottomd, NNODES, radiusd); - CHECK_CUDA(stream); - - // Compute bounding box arround all bodies - BoundingBoxKernel<<>>(startl, - childl, - massl, - nodes_pos, - nodes_pos + nnodes + 1, - maxxl, - maxyl, - minxl, - minyl, - FOUR_NNODES, - NNODES, - n, - limiter, - radiusd); - CHECK_CUDA(stream); - - ClearKernel1<<>>(childl, FOUR_NNODES, FOUR_N); - CHECK_CUDA(stream); - - // Build quadtree - TreeBuildingKernel<<>>( - childl, nodes_pos, nodes_pos + nnodes + 1, NNODES, n, maxdepthd, bottomd, radiusd); - CHECK_CUDA(stream); - - ClearKernel2<<>>(startl, massl, NNODES, bottomd); - CHECK_CUDA(stream); - - // Summarizes mass and position for each cell, bottom up approach - SummarizationKernel<<>>( - countl, childl, massl, nodes_pos, nodes_pos + nnodes + 1, NNODES, n, bottomd); - CHECK_CUDA(stream); - - // Group closed bodies together, used to speed up Repulsion kernel - SortKernel<<>>( - sortl, countl, startl, childl, NNODES, n, bottomd); - CHECK_CUDA(stream); - - // Force computation O(n . 
log(n)) - RepulsionKernel<<>>(scaling_ratio, - theta, - epssq, - sortl, - childl, - massl, - nodes_pos, - nodes_pos + nnodes + 1, - rep_forces, - rep_forces + nnodes + 1, - theta_squared, - NNODES, - FOUR_NNODES, - n, - radiusd_squared, - maxdepthd); - CHECK_CUDA(stream); - - apply_gravity(nodes_pos, - nodes_pos + nnodes + 1, - attract, - attract + n, - massl, - gravity, - strong_gravity_mode, - scaling_ratio, - n, - stream); - - apply_attraction(row, - col, - v, - e, - nodes_pos, - nodes_pos + nnodes + 1, - attract, - attract + n, - massl, - outbound_attraction_distribution, - lin_log_mode, - edge_weight_influence, - outbound_att_compensation, - stream); - - compute_local_speed(rep_forces, - rep_forces + nnodes + 1, - attract, - attract + n, - old_forces, - old_forces + n, - massl, - swinging, - traction, - n, - stream); - - // Compute global swinging and traction values - const float s = - thrust::reduce(rmm::exec_policy(stream)->on(stream), d_swinging.begin(), d_swinging.end()); - - const float t = - thrust::reduce(rmm::exec_policy(stream)->on(stream), d_traction.begin(), d_traction.end()); - - // Compute global speed based on gloab and local swinging and traction. - adapt_speed(jitter_tolerance, &jt, &speed, &speed_efficiency, s, t, n); - - // Update positions - apply_forces_bh<<>>(nodes_pos, - nodes_pos + nnodes + 1, - attract, - attract + n, - rep_forces, - rep_forces + nnodes + 1, - old_forces, - old_forces + n, - swinging, - speed, - n); - - if (callback) callback->on_epoch_end(nodes_pos); - - if (verbose) { - printf("iteration %i, speed: %f, speed_efficiency: %f, ", iter + 1, speed, speed_efficiency); - printf("jt: %f, ", jt); - printf("swinging: %f, traction: %f\n", s, t); - } - } - - // Copy nodes positions into final output pos - copy(n, nodes_pos, pos); - copy(n, nodes_pos + nnodes + 1, pos + n); - - if (callback) callback->on_train_end(nodes_pos); -} - -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/layout/bh_kernels.hpp b/cpp/src/layout/bh_kernels.cuh similarity index 88% rename from cpp/src/layout/bh_kernels.hpp rename to cpp/src/layout/bh_kernels.cuh index 5c170e147c9..8dbc08a6440 100644 --- a/cpp/src/layout/bh_kernels.hpp +++ b/cpp/src/layout/bh_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,9 +42,9 @@ namespace detail { /** * Intializes the states of objects. This speeds the overall kernel up. */ -__global__ void InitializationKernel(unsigned *restrict limiter, - int *restrict maxdepthd, - float *restrict radiusd) +__global__ void InitializationKernel(unsigned* restrict limiter, + int* restrict maxdepthd, + float* restrict radiusd) { maxdepthd[0] = 1; limiter[0] = 0; @@ -54,10 +54,10 @@ __global__ void InitializationKernel(unsigned *restrict limiter, /** * Reset root. */ -__global__ void ResetKernel(float *restrict radiusd_squared, - int *restrict bottomd, +__global__ void ResetKernel(float* restrict radiusd_squared, + int* restrict bottomd, const int NNODES, - const float *restrict radiusd) + const float* restrict radiusd) { radiusd_squared[0] = radiusd[0] * radiusd[0]; // create root node @@ -67,20 +67,20 @@ __global__ void ResetKernel(float *restrict radiusd_squared, /** * Figures the bounding boxes for every point in the embedding. 
*/ -__global__ __launch_bounds__(THREADS1, FACTOR1) void BoundingBoxKernel(int *restrict startd, - int *restrict childd, - int *restrict massd, - float *restrict posxd, - float *restrict posyd, - float *restrict maxxd, - float *restrict maxyd, - float *restrict minxd, - float *restrict minyd, +__global__ __launch_bounds__(THREADS1, FACTOR1) void BoundingBoxKernel(int* restrict startd, + int* restrict childd, + int* restrict massd, + float* restrict posxd, + float* restrict posyd, + float* restrict maxxd, + float* restrict maxyd, + float* restrict minxd, + float* restrict minyd, const int FOUR_NNODES, const int NNODES, const int N, - unsigned *restrict limiter, - float *restrict radiusd) + unsigned* restrict limiter, + float* restrict radiusd) { float val, minx, maxx, miny, maxy; __shared__ float sminx[THREADS1], smaxx[THREADS1], sminy[THREADS1], smaxy[THREADS1]; @@ -150,14 +150,15 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void BoundingBoxKernel(int *rest posyd[NNODES] = (miny + maxy) * 0.5f; #pragma unroll - for (int a = 0; a < 4; a++) childd[FOUR_NNODES + a] = -1; + for (int a = 0; a < 4; a++) + childd[FOUR_NNODES + a] = -1; } } /** * Clear some of the state vectors up. */ -__global__ __launch_bounds__(1024, 1) void ClearKernel1(int *restrict childd, +__global__ __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd, const int FOUR_NNODES, const int FOUR_N) { @@ -167,21 +168,22 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel1(int *restrict childd, // iterate over all cells assigned to thread #pragma unroll - for (; k < FOUR_NNODES; k += inc) childd[k] = -1; + for (; k < FOUR_NNODES; k += inc) + childd[k] = -1; } /** * Build the actual KD Tree. */ __global__ __launch_bounds__(THREADS2, - FACTOR2) void TreeBuildingKernel(int *restrict childd, - const float *restrict posxd, - const float *restrict posyd, + FACTOR2) void TreeBuildingKernel(int* restrict childd, + const float* restrict posxd, + const float* restrict posyd, const int NNODES, const int N, - int *restrict maxdepthd, - int *restrict bottomd, - const float *restrict radiusd) + int* restrict maxdepthd, + int* restrict bottomd, + const float* restrict radiusd) { int j, depth; float x, y, r; @@ -294,10 +296,10 @@ __global__ __launch_bounds__(THREADS2, /** * Clean more state vectors. 
*/ -__global__ __launch_bounds__(1024, 1) void ClearKernel2(int *restrict startd, - int *restrict massd, +__global__ __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd, + int* restrict massd, const int NNODES, - const int *restrict bottomd) + const int* restrict bottomd) { const int bottom = bottomd[0]; const int inc = blockDim.x * gridDim.x; @@ -316,14 +318,14 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel2(int *restrict startd, * Summarize the KD Tree via cell gathering */ __global__ __launch_bounds__(THREADS3, - FACTOR3) void SummarizationKernel(int *restrict countd, - const int *restrict childd, - volatile int *restrict massd, - float *restrict posxd, - float *restrict posyd, + FACTOR3) void SummarizationKernel(int* restrict countd, + const int* restrict childd, + volatile int* restrict massd, + float* restrict posxd, + float* restrict posyd, const int NNODES, const int N, - const int *restrict bottomd) + const int* restrict bottomd) { bool flag = 0; float cm, px, py; @@ -451,13 +453,13 @@ __global__ __launch_bounds__(THREADS3, /** * Sort the cells */ -__global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(int *restrict sortd, - const int *restrict countd, - volatile int *restrict startd, - int *restrict childd, +__global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(int* restrict sortd, + const int* restrict countd, + volatile int* restrict startd, + int* restrict childd, const int NNODES, const int N, - const int *restrict bottomd) + const int* restrict bottomd) { const int bottom = bottomd[0]; const int dec = blockDim.x * gridDim.x; @@ -505,19 +507,19 @@ __global__ __launch_bounds__( const float scaling_ratio, const float theta, const float epssqd, // correction for zero distance - const int *restrict sortd, - const int *restrict childd, - const int *restrict massd, - const float *restrict posxd, - const float *restrict posyd, - float *restrict velxd, - float *restrict velyd, + const int* restrict sortd, + const int* restrict childd, + const int* restrict massd, + const float* restrict posxd, + const float* restrict posyd, + float* restrict velxd, + float* restrict velyd, const float theta_squared, const int NNODES, const int FOUR_NNODES, const int N, - const float *restrict radiusd_squared, - const int *restrict maxdepthd) + const float* restrict radiusd_squared, + const int* restrict maxdepthd) { __shared__ int pos[THREADS5], node[THREADS5]; __shared__ float dq[THREADS5]; @@ -611,15 +613,15 @@ __global__ __launch_bounds__( } __global__ __launch_bounds__(THREADS6, - FACTOR6) void apply_forces_bh(float *restrict Y_x, - float *restrict Y_y, - const float *restrict attract_x, - const float *restrict attract_y, - const float *restrict repel_x, - const float *restrict repel_y, - float *restrict old_dx, - float *restrict old_dy, - const float *restrict swinging, + FACTOR6) void apply_forces_bh(float* restrict Y_x, + float* restrict Y_y, + const float* restrict attract_x, + const float* restrict attract_y, + const float* restrict repel_x, + const float* restrict repel_y, + float* restrict old_dx, + float* restrict old_dy, + const float* restrict swinging, const float speed, const int n) { diff --git a/cpp/src/layout/exact_fa2.hpp b/cpp/src/layout/exact_fa2.cuh similarity index 55% rename from cpp/src/layout/exact_fa2.hpp rename to cpp/src/layout/exact_fa2.cuh index 0b90e417968..5b5c3f5e82e 100644 --- a/cpp/src/layout/exact_fa2.hpp +++ b/cpp/src/layout/exact_fa2.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,27 +16,30 @@ #pragma once -#include -#include +#include +#include -#include #include -#include -#include -#include "exact_repulsion.hpp" -#include "fa2_kernels.hpp" +#include +#include +#include +#include + +#include "exact_repulsion.cuh" +#include "fa2_kernels.cuh" #include "utils.hpp" namespace cugraph { namespace detail { template -void exact_fa2(GraphCOOView &graph, - float *pos, +void exact_fa2(raft::handle_t const& handle, + legacy::GraphCOOView& graph, + float* pos, const int max_iter = 500, - float *x_start = nullptr, - float *y_start = nullptr, + float* x_start = nullptr, + float* y_start = nullptr, bool outbound_attraction_distribution = true, bool lin_log_mode = false, bool prevent_overlapping = false, @@ -46,52 +49,53 @@ void exact_fa2(GraphCOOView &graph, bool strong_gravity_mode = false, const float gravity = 1.0, bool verbose = false, - internals::GraphBasedDimRedCallback *callback = nullptr) + internals::GraphBasedDimRedCallback* callback = nullptr) { - cudaStream_t stream = {nullptr}; - const edge_t e = graph.number_of_edges; - const vertex_t n = graph.number_of_vertices; - - float *d_repel{nullptr}; - float *d_attract{nullptr}; - float *d_old_forces{nullptr}; - int *d_mass{nullptr}; - float *d_swinging{nullptr}; - float *d_traction{nullptr}; - - rmm::device_vector repel(n * 2, 0); - rmm::device_vector attract(n * 2, 0); - rmm::device_vector old_forces(n * 2, 0); + auto stream_view = handle.get_stream_view(); + const edge_t e = graph.number_of_edges; + const vertex_t n = graph.number_of_vertices; + + float* d_repel{nullptr}; + float* d_attract{nullptr}; + float* d_old_forces{nullptr}; + int* d_mass{nullptr}; + float* d_swinging{nullptr}; + float* d_traction{nullptr}; + + rmm::device_uvector repel(n * 2, stream_view); + rmm::device_uvector attract(n * 2, stream_view); + rmm::device_uvector old_forces(n * 2, stream_view); + thrust::fill(rmm::exec_policy(stream_view), old_forces.begin(), old_forces.end(), 0.f); // FA2 requires degree + 1. - rmm::device_vector mass(n, 1); - rmm::device_vector swinging(n, 0); - rmm::device_vector traction(n, 0); + rmm::device_uvector mass(n, stream_view); + thrust::fill(rmm::exec_policy(stream_view), mass.begin(), mass.end(), 1); + rmm::device_uvector swinging(n, stream_view); + rmm::device_uvector traction(n, stream_view); - d_repel = repel.data().get(); - d_attract = attract.data().get(); - d_old_forces = old_forces.data().get(); - d_mass = mass.data().get(); - d_swinging = swinging.data().get(); - d_traction = traction.data().get(); + d_repel = repel.data(); + d_attract = attract.data(); + d_old_forces = old_forces.data(); + d_mass = mass.data(); + d_swinging = swinging.data(); + d_traction = traction.data(); - int random_state = 0; - random_vector(pos, n * 2, random_state, stream); + uniform_random_fill(handle.get_stream_view(), pos, n * 2, -100.0f, 100.0f, uint64_t{0}); if (x_start && y_start) { - copy(n, x_start, pos); - copy(n, y_start, pos + n); + raft::copy(pos, x_start, n, stream_view.value()); + raft::copy(pos + n, y_start, n, stream_view.value()); } // Sort COO for coalesced memory access. 
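   // Sorting pays off because the attraction kernel walks the edge list by global
   // thread id; after the sort, neighboring threads work on edges that share the same
   // or nearby source vertices, so their reads of x_pos[src] and their atomic updates
   // to attract_x[src] hit nearby, cache-friendly addresses. A sketch of the
   // per-thread access pattern, one thread per edge:
   //   vertex_t src = row[tid];   // sorted: tid and tid + 1 see nearby src values
   //   float xs = x_pos[src];     // -> clustered loads instead of scattered ones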
- sort(graph, stream); - CHECK_CUDA(stream); + sort(graph, stream_view.value()); + CHECK_CUDA(stream_view.value()); - graph.degree(d_mass, cugraph::DegreeDirection::OUT); - CHECK_CUDA(stream); + graph.degree(d_mass, cugraph::legacy::DegreeDirection::OUT); + CHECK_CUDA(stream_view.value()); - const vertex_t *row = graph.src_indices; - const vertex_t *col = graph.dst_indices; - const weight_t *v = graph.edge_data; + const vertex_t* row = graph.src_indices; + const vertex_t* col = graph.dst_indices; + const weight_t* v = graph.edge_data; float speed = 1.f; float speed_efficiency = 1.f; @@ -99,7 +103,7 @@ void exact_fa2(GraphCOOView &graph, float jt = 0.f; if (outbound_attraction_distribution) { - int sum = thrust::reduce(rmm::exec_policy(stream)->on(stream), mass.begin(), mass.end()); + int sum = thrust::reduce(rmm::exec_policy(stream_view), mass.begin(), mass.end()); outbound_att_compensation = sum / (float)n; } @@ -110,13 +114,14 @@ void exact_fa2(GraphCOOView &graph, for (int iter = 0; iter < max_iter; ++iter) { // Reset force arrays - fill(n * 2, d_repel, 0.f); - fill(n * 2, d_attract, 0.f); - fill(n, d_swinging, 0.f); - fill(n, d_traction, 0.f); + thrust::fill(rmm::exec_policy(stream_view), repel.begin(), repel.end(), 0.f); + thrust::fill(rmm::exec_policy(stream_view), attract.begin(), attract.end(), 0.f); + thrust::fill(rmm::exec_policy(stream_view), swinging.begin(), swinging.end(), 0.f); + thrust::fill(rmm::exec_policy(stream_view), traction.begin(), traction.end(), 0.f); // Exact repulsion - apply_repulsion(pos, pos + n, d_repel, d_repel + n, d_mass, scaling_ratio, n, stream); + apply_repulsion( + pos, pos + n, d_repel, d_repel + n, d_mass, scaling_ratio, n, stream_view.value()); apply_gravity(pos, pos + n, @@ -127,7 +132,7 @@ void exact_fa2(GraphCOOView &graph, strong_gravity_mode, scaling_ratio, n, - stream); + stream_view.value()); apply_attraction(row, col, @@ -142,7 +147,7 @@ void exact_fa2(GraphCOOView &graph, lin_log_mode, edge_weight_influence, outbound_att_compensation, - stream); + stream_view.value()); compute_local_speed(d_repel, d_repel + n, @@ -154,13 +159,11 @@ void exact_fa2(GraphCOOView &graph, d_swinging, d_traction, n, - stream); + stream_view.value()); // Compute global swinging and traction values. 
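   // Intuition for the speed update below (a sketch, not the exact formula):
   //   speed ~ jitter_tolerance * traction / swinging
   // so a layout that oscillates (high swinging, force directions keep flipping) is
   // slowed down, while one making steady progress (high traction) may move faster.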
- const float s = - thrust::reduce(rmm::exec_policy(stream)->on(stream), swinging.begin(), swinging.end()); - const float t = - thrust::reduce(rmm::exec_policy(stream)->on(stream), traction.begin(), traction.end()); + const float s = thrust::reduce(rmm::exec_policy(stream_view), swinging.begin(), swinging.end()); + const float t = thrust::reduce(rmm::exec_policy(stream_view), traction.begin(), traction.end()); adapt_speed(jitter_tolerance, &jt, &speed, &speed_efficiency, s, t, n); @@ -175,14 +178,14 @@ void exact_fa2(GraphCOOView &graph, d_swinging, speed, n, - stream); + stream_view.value()); if (callback) callback->on_epoch_end(pos); if (verbose) { - printf("iteration %i, speed: %f, speed_efficiency: %f, ", iter + 1, speed, speed_efficiency); - printf("jt: %f, ", jt); - printf("swinging: %f, traction: %f\n", s, t); + std::cout << "iteration: " << iter + 1 << ", speed: " << speed + << ", speed_efficiency: " << speed_efficiency << ", jt: " << jt + << ", swinging: " << s << ", traction: " << t << "\n"; } } diff --git a/cpp/src/layout/exact_repulsion.hpp b/cpp/src/layout/exact_repulsion.cuh similarity index 77% rename from cpp/src/layout/exact_repulsion.hpp rename to cpp/src/layout/exact_repulsion.cuh index 583d5c81e30..49b4f46c5c3 100644 --- a/cpp/src/layout/exact_repulsion.hpp +++ b/cpp/src/layout/exact_repulsion.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,18 +15,18 @@ */ #pragma once -#define restrict __restrict__ +#define restrict __restrict__ #define CUDA_MAX_BLOCKS_2D 256 namespace cugraph { namespace detail { template -__global__ void repulsion_kernel(const float *restrict x_pos, - const float *restrict y_pos, - float *restrict repel_x, - float *restrict repel_y, - const int *restrict mass, +__global__ void repulsion_kernel(const float* restrict x_pos, + const float* restrict y_pos, + float* restrict repel_x, + float* restrict repel_y, + const int* restrict mass, const float scaling_ratio, const vertex_t n) { @@ -50,11 +50,11 @@ __global__ void repulsion_kernel(const float *restrict x_pos, } template -void apply_repulsion(const float *restrict x_pos, - const float *restrict y_pos, - float *restrict repel_x, - float *restrict repel_y, - const int *restrict mass, +void apply_repulsion(const float* restrict x_pos, + const float* restrict y_pos, + float* restrict repel_x, + float* restrict repel_y, + const int* restrict mass, const float scaling_ratio, const vertex_t n, cudaStream_t stream) diff --git a/cpp/src/layout/fa2_kernels.hpp b/cpp/src/layout/fa2_kernels.cuh similarity index 67% rename from cpp/src/layout/fa2_kernels.hpp rename to cpp/src/layout/fa2_kernels.cuh index 0c7e9b1d193..c46b249628b 100644 --- a/cpp/src/layout/fa2_kernels.hpp +++ b/cpp/src/layout/fa2_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,21 +17,21 @@ #pragma once #define restrict __restrict__ -#include "utilities/graph_utils.cuh" +#include namespace cugraph { namespace detail { template -__global__ void attraction_kernel(const vertex_t *restrict row, - const vertex_t *restrict col, - const weight_t *restrict v, +__global__ void attraction_kernel(const vertex_t* restrict row, + const vertex_t* restrict col, + const weight_t* restrict v, const edge_t e, - const float *restrict x_pos, - const float *restrict y_pos, - float *restrict attract_x, - float *restrict attract_y, - const int *restrict mass, + const float* restrict x_pos, + const float* restrict y_pos, + float* restrict attract_x, + float* restrict attract_y, + const int* restrict mass, bool outbound_attraction_distribution, bool lin_log_mode, const float edge_weight_influence, @@ -71,15 +71,15 @@ __global__ void attraction_kernel(const vertex_t *restrict row, } template -void apply_attraction(const vertex_t *restrict row, - const vertex_t *restrict col, - const weight_t *restrict v, +void apply_attraction(const vertex_t* restrict row, + const vertex_t* restrict col, + const weight_t* restrict v, const edge_t e, - const float *restrict x_pos, - const float *restrict y_pos, - float *restrict attract_x, - float *restrict attract_y, - const int *restrict mass, + const float* restrict x_pos, + const float* restrict y_pos, + float* restrict attract_x, + float* restrict attract_y, + const int* restrict mass, bool outbound_attraction_distribution, bool lin_log_mode, const float edge_weight_influence, @@ -116,11 +116,11 @@ void apply_attraction(const vertex_t *restrict row, } template -__global__ void linear_gravity_kernel(const float *restrict x_pos, - const float *restrict y_pos, - float *restrict attract_x, - float *restrict attract_y, - const int *restrict mass, +__global__ void linear_gravity_kernel(const float* restrict x_pos, + const float* restrict y_pos, + float* restrict attract_x, + float* restrict attract_y, + const int* restrict mass, const float gravity, const vertex_t n) { @@ -136,11 +136,11 @@ __global__ void linear_gravity_kernel(const float *restrict x_pos, } template -__global__ void strong_gravity_kernel(const float *restrict x_pos, - const float *restrict y_pos, - float *restrict attract_x, - float *restrict attract_y, - const int *restrict mass, +__global__ void strong_gravity_kernel(const float* restrict x_pos, + const float* restrict y_pos, + float* restrict attract_x, + float* restrict attract_y, + const int* restrict mass, const float gravity, const float scaling_ratio, const vertex_t n) @@ -157,11 +157,11 @@ __global__ void strong_gravity_kernel(const float *restrict x_pos, } template -void apply_gravity(const float *restrict x_pos, - const float *restrict y_pos, - float *restrict attract_x, - float *restrict attract_y, - const int *restrict mass, +void apply_gravity(const float* restrict x_pos, + const float* restrict y_pos, + float* restrict attract_x, + float* restrict attract_y, + const int* restrict mass, const float gravity, bool strong_gravity_mode, const float scaling_ratio, @@ -187,15 +187,15 @@ void apply_gravity(const float *restrict x_pos, } template -__global__ void local_speed_kernel(const float *restrict repel_x, - const float *restrict repel_y, - const float *restrict attract_x, - const float *restrict attract_y, - const float *restrict old_dx, - const float *restrict old_dy, - const int *restrict mass, - float *restrict swinging, - float *restrict traction, +__global__ void local_speed_kernel(const float* restrict repel_x, + 
const float* restrict repel_y, + const float* restrict attract_x, + const float* restrict attract_y, + const float* restrict old_dx, + const float* restrict old_dy, + const int* restrict mass, + float* restrict swinging, + float* restrict traction, const vertex_t n) { // For every node. @@ -210,15 +210,15 @@ __global__ void local_speed_kernel(const float *restrict repel_x, } template -void compute_local_speed(const float *restrict repel_x, - const float *restrict repel_y, - const float *restrict attract_x, - const float *restrict attract_y, - float *restrict old_dx, - float *restrict old_dy, - const int *restrict mass, - float *restrict swinging, - float *restrict traction, +void compute_local_speed(const float* restrict repel_x, + const float* restrict repel_y, + const float* restrict attract_x, + const float* restrict attract_y, + float* restrict old_dx, + float* restrict old_dy, + const int* restrict mass, + float* restrict swinging, + float* restrict traction, const vertex_t n, cudaStream_t stream) { @@ -237,9 +237,9 @@ void compute_local_speed(const float *restrict repel_x, template void adapt_speed(const float jitter_tolerance, - float *restrict jt, - float *restrict speed, - float *restrict speed_efficiency, + float* restrict jt, + float* restrict speed, + float* restrict speed_efficiency, const float s, const float t, const vertex_t n) @@ -272,15 +272,15 @@ void adapt_speed(const float jitter_tolerance, } template -__global__ void update_positions_kernel(float *restrict x_pos, - float *restrict y_pos, - const float *restrict repel_x, - const float *restrict repel_y, - const float *restrict attract_x, - const float *restrict attract_y, - float *restrict old_dx, - float *restrict old_dy, - const float *restrict swinging, +__global__ void update_positions_kernel(float* restrict x_pos, + float* restrict y_pos, + const float* restrict repel_x, + const float* restrict repel_y, + const float* restrict attract_x, + const float* restrict attract_y, + float* restrict old_dx, + float* restrict old_dy, + const float* restrict swinging, const float speed, const vertex_t n) { @@ -298,15 +298,15 @@ __global__ void update_positions_kernel(float *restrict x_pos, } template -void apply_forces(float *restrict x_pos, - float *restrict y_pos, - const float *restrict repel_x, - const float *restrict repel_y, - const float *restrict attract_x, - const float *restrict attract_y, - float *restrict old_dx, - float *restrict old_dy, - const float *restrict swinging, +void apply_forces(float* restrict x_pos, + float* restrict y_pos, + const float* restrict repel_x, + const float* restrict repel_y, + const float* restrict attract_x, + const float* restrict attract_y, + float* restrict old_dx, + float* restrict old_dy, + const float* restrict swinging, const float speed, const vertex_t n, cudaStream_t stream) diff --git a/cpp/src/layout/force_atlas2.cu b/cpp/src/layout/force_atlas2.cu index ef00f504d86..9dc4ee286b0 100644 --- a/cpp/src/layout/force_atlas2.cu +++ b/cpp/src/layout/force_atlas2.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,17 +14,18 @@ * limitations under the License. 
*/ -#include "barnes_hut.hpp" -#include "exact_fa2.hpp" +#include "barnes_hut.cuh" +#include "exact_fa2.cuh" namespace cugraph { template -void force_atlas2(GraphCOOView &graph, - float *pos, +void force_atlas2(raft::handle_t const& handle, + legacy::GraphCOOView& graph, + float* pos, const int max_iter, - float *x_start, - float *y_start, + float* x_start, + float* y_start, bool outbound_attraction_distribution, bool lin_log_mode, bool prevent_overlapping, @@ -36,13 +37,14 @@ void force_atlas2(GraphCOOView &graph, bool strong_gravity_mode, const float gravity, bool verbose, - internals::GraphBasedDimRedCallback *callback) + internals::GraphBasedDimRedCallback* callback) { - CUGRAPH_EXPECTS(pos != nullptr, "Invalid API parameter: pos array should be of size 2 * V"); + CUGRAPH_EXPECTS(pos != nullptr, "Invalid input argument: pos array should be of size 2 * V"); CUGRAPH_EXPECTS(graph.number_of_vertices != 0, "Invalid input: Graph is empty"); if (!barnes_hut_optimize) { - cugraph::detail::exact_fa2(graph, + cugraph::detail::exact_fa2(handle, + graph, pos, max_iter, x_start, @@ -58,7 +60,8 @@ void force_atlas2(GraphCOOView &graph, verbose, callback); } else { - cugraph::detail::barnes_hut(graph, + cugraph::detail::barnes_hut(handle, + graph, pos, max_iter, x_start, @@ -77,11 +80,12 @@ void force_atlas2(GraphCOOView &graph, } } -template void force_atlas2(GraphCOOView &graph, - float *pos, +template void force_atlas2(raft::handle_t const& handle, + legacy::GraphCOOView& graph, + float* pos, const int max_iter, - float *x_start, - float *y_start, + float* x_start, + float* y_start, bool outbound_attraction_distribution, bool lin_log_mode, bool prevent_overlapping, @@ -93,13 +97,14 @@ template void force_atlas2(GraphCOOView &graph bool strong_gravity_mode, const float gravity, bool verbose, - internals::GraphBasedDimRedCallback *callback); + internals::GraphBasedDimRedCallback* callback); -template void force_atlas2(GraphCOOView &graph, - float *pos, +template void force_atlas2(raft::handle_t const& handle, + legacy::GraphCOOView& graph, + float* pos, const int max_iter, - float *x_start, - float *y_start, + float* x_start, + float* y_start, bool outbound_attraction_distribution, bool lin_log_mode, bool prevent_overlapping, @@ -111,6 +116,6 @@ template void force_atlas2(GraphCOOView &gra bool strong_gravity_mode, const float gravity, bool verbose, - internals::GraphBasedDimRedCallback *callback); + internals::GraphBasedDimRedCallback* callback); } // namespace cugraph diff --git a/cpp/src/layout/utils.hpp b/cpp/src/layout/utils.hpp index 335b8ea986c..ffbeb291e58 100644 --- a/cpp/src/layout/utils.hpp +++ b/cpp/src/layout/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
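A pattern worth calling out here: throughout the surrounding hunks, Thrust calls move off the deprecated `rmm::exec_policy(stream)->on(stream)` chain (and off `rmm::device_vector`, whose raw pointer required `.data().get()`) onto `rmm::cuda_stream_view`-based policies and `rmm::device_uvector`. Below is a minimal sketch of the new idiom, assuming the RMM/RAFT headers of this release; the helper name and buffer are hypothetical, not part of this diff.

```cpp
#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/fill.h>
#include <thrust/reduce.h>

// Hypothetical helper: zero a scratch buffer and sum it on the handle's stream.
float zero_and_sum(raft::handle_t const& handle, std::size_t n)
{
  // device_uvector is uninitialized and stream-ordered; the stream is passed
  // explicitly, replacing rmm::device_vector (and .data() replaces .data().get()).
  rmm::device_uvector<float> scratch(n, handle.get_stream_view());

  // rmm::exec_policy now takes a cuda_stream_view directly, replacing the
  // deprecated rmm::exec_policy(stream)->on(stream) chain seen in the removed lines.
  thrust::fill(rmm::exec_policy(handle.get_stream_view()), scratch.begin(), scratch.end(), 0.f);

  return thrust::reduce(rmm::exec_policy(handle.get_stream_view()), scratch.begin(), scratch.end());
}
```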
@@ -18,27 +18,9 @@ #include -#include - namespace cugraph { namespace detail { -struct prg { - __host__ __device__ float operator()(int n) - { - thrust::default_random_engine rng; - thrust::uniform_real_distribution dist(-100.f, 100.f); - rng.discard(n); - return dist(rng); - } -}; - -void random_vector(float *vec, int n, int seed, cudaStream_t stream) -{ - thrust::counting_iterator index(seed); - thrust::transform(rmm::exec_policy(stream)->on(stream), index, index + n, vec, prg()); -} - /** helper method to get multi-processor count parameter */ inline int getMultiProcessorCount() { diff --git a/cpp/src/linear_assignment/README-hungarian.md b/cpp/src/linear_assignment/README-hungarian.md new file mode 100644 index 00000000000..42dabd7cfbc --- /dev/null +++ b/cpp/src/linear_assignment/README-hungarian.md @@ -0,0 +1,36 @@ +# LAP +Implementation of the ***O(n^3) Alternating Tree Variant*** of the Hungarian Algorithm on NVIDIA CUDA-enabled GPUs. + +This implementation solves a batch of ***k*** **Linear Assignment Problems (LAP)**, each with an ***nxn*** matrix of single-precision floating-point cost values. At optimality, the algorithm produces an assignment with ***minimum*** cost. + +The API can be used to query optimal primal and dual costs, the optimal assignment vector, and optimal row/column dual vectors for each subproblem in the batch. + +cuGraph exposes the Hungarian algorithm; the actual implementation is contained in the RAFT library, which holds common tools and kernels shared between cuGraph and cuML. + +The following parameters can be used to tune the performance of the algorithm: + +1. epsilon: (in raft/lap/lap_kernels.cuh) This parameter controls the tolerance on the floating-point precision. Setting it too small will increase solution time because the algorithm will search for more precise solutions. Setting it too high may cause some inaccuracies. + +2. BLOCKDIMX, BLOCKDIMY: (in raft/lap/lap_functions.cuh) These parameters control the threads per block used along the given dimension. Set them according to the device specifications and an occupancy calculation. + +***This library is licensed under the Apache License 2.0. Please cite our paper if this library helps you in your research.*** + +- Harvard citation style + + Date, K. and Nagi, R., 2016. GPU-accelerated Hungarian algorithms for the Linear Assignment Problem. Parallel Computing, 57, pp.52-72. + +- BibTeX citation block for use in a LaTeX bibliography file: + +``` +@article{date2016gpu, + title={GPU-accelerated Hungarian algorithms for the Linear Assignment Problem}, + author={Date, Ketan and Nagi, Rakesh}, + journal={Parallel Computing}, + volume={57}, + pages={52--72}, + year={2016}, + publisher={Elsevier} +} +``` + +The paper is available online at [ScienceDirect](https://www.sciencedirect.com/science/article/abs/pii/S016781911630045X). diff --git a/cpp/src/linear_assignment/hungarian.cu b/cpp/src/linear_assignment/hungarian.cu index 164a386c6dd..368e119e93c 100644 --- a/cpp/src/linear_assignment/hungarian.cu +++ b/cpp/src/linear_assignment/hungarian.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,18 +13,20 @@ * See the License for the specific language governing permissions and * limitations under the License.
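To make the README's description concrete, here is a minimal calling sketch for the dense entry point whose diff follows. This is a sketch under stated assumptions, not part of the change itself: the public header names are assumptions, the cost values are illustrative, and the trailing epsilon argument is the overload introduced below (equivalent to `detail::default_epsilon()`, i.e. 1e-6 for `float`, when the shorter overload is used).

```cpp
#include <cugraph/algorithms.hpp>  // assumed header declaring cugraph::dense::hungarian

#include <raft/cudart_utils.h>  // assumed location of raft::copy in this era
#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>

#include <cstdint>
#include <vector>

int main()
{
  raft::handle_t handle;

  // Row-major 3x3 cost matrix; values are purely illustrative.
  std::vector<float> h_costs{4.f, 1.f, 3.f, 2.f, 0.f, 5.f, 3.f, 2.f, 2.f};

  rmm::device_uvector<float> d_costs(h_costs.size(), handle.get_stream_view());
  rmm::device_uvector<int32_t> d_assignment(3, handle.get_stream_view());
  raft::copy(d_costs.data(), h_costs.data(), h_costs.size(), handle.get_stream());

  // Returns the minimum total cost; d_assignment[i] is the column assigned to
  // row i. Rectangular problems are padded to square internally, as the hunks
  // below show.
  float objective = cugraph::dense::hungarian(
    handle, d_costs.data(), int32_t{3}, int32_t{3}, d_assignment.data(), float{1e-6});

  return objective >= 0.f ? 0 : 1;
}
```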
*/ -#include -#include +#include +#include + +#include -#include -#include +#include +#include +#include #include #include -#include - -#include +#include +#include //#define TIMING @@ -35,41 +37,95 @@ namespace cugraph { namespace detail { -template -weight_t hungarian(raft::handle_t const &handle, - index_t num_rows, - index_t num_cols, - weight_t const *d_original_cost, - index_t *d_assignment, - cudaStream_t stream) +template +weight_t default_epsilon() { - // - // TODO: Can Date/Nagi implementation in raft handle rectangular matrices? - // - CUGRAPH_EXPECTS(num_rows == num_cols, "Current implementation only supports square matrices"); - - rmm::device_vector col_assignments_v(num_rows); + return 0; +} - // Create an instance of LinearAssignmentProblem using problem size, number of subproblems - raft::lap::LinearAssignmentProblem lpx(handle, num_rows, 1); +template <> +float default_epsilon() +{ + return float{1e-6}; +} - // Solve LAP(s) for given cost matrix - lpx.solve(d_original_cost, d_assignment, col_assignments_v.data().get()); +template <> +double default_epsilon() +{ + return double{1e-6}; +} - return lpx.getPrimalObjectiveValue(0); +template +weight_t hungarian(raft::handle_t const& handle, + index_t num_rows, + index_t num_cols, + weight_t const* d_original_cost, + index_t* d_assignment, + weight_t epsilon) +{ + if (num_rows == num_cols) { + rmm::device_uvector col_assignments_v(num_rows, handle.get_stream_view()); + + // Create an instance of LinearAssignmentProblem using problem size, number of subproblems + raft::lap::LinearAssignmentProblem lpx(handle, num_rows, 1, epsilon); + + // Solve LAP(s) for given cost matrix + lpx.solve(d_original_cost, d_assignment, col_assignments_v.data()); + + return lpx.getPrimalObjectiveValue(0); + } else { + // + // Create a square matrix, copy d_original_cost into it. + // Fill the extra rows/columns with max(d_original_cost) + // + index_t n = std::max(num_rows, num_cols); + weight_t max_cost = thrust::reduce(rmm::exec_policy(handle.get_stream_view()), + d_original_cost, + d_original_cost + (num_rows * num_cols), + weight_t{0}, + thrust::maximum()); + + rmm::device_uvector tmp_cost_v(n * n, handle.get_stream_view()); + rmm::device_uvector tmp_row_assignment_v(n, handle.get_stream_view()); + rmm::device_uvector tmp_col_assignment_v(n, handle.get_stream_view()); + + thrust::transform(rmm::exec_policy(handle.get_stream_view()), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(n * n), + tmp_cost_v.begin(), + [max_cost, d_original_cost, n, num_rows, num_cols] __device__(index_t i) { + index_t row = i / n; + index_t col = i % n; + + return ((row < num_rows) && (col < num_cols)) + ? 
d_original_cost[row * num_cols + col] + : max_cost; + }); + + raft::lap::LinearAssignmentProblem lpx(handle, n, 1, epsilon); + + // Solve LAP(s) for given cost matrix + lpx.solve(tmp_cost_v.begin(), tmp_row_assignment_v.begin(), tmp_col_assignment_v.begin()); + + weight_t tmp_objective_value = lpx.getPrimalObjectiveValue(0); + + raft::copy(d_assignment, tmp_row_assignment_v.begin(), num_rows, handle.get_stream()); + + return tmp_objective_value - max_cost * std::abs(num_rows - num_cols); + } } template -weight_t hungarian_sparse(raft::handle_t const &handle, - GraphCOOView const &graph, +weight_t hungarian_sparse(raft::handle_t const& handle, + legacy::GraphCOOView const& graph, vertex_t num_workers, - vertex_t const *workers, - vertex_t *assignment, - cudaStream_t stream) + vertex_t const* workers, + vertex_t* assignment, + weight_t epsilon) { - CUGRAPH_EXPECTS(assignment != nullptr, "Invalid API parameter: assignment pointer is NULL"); + CUGRAPH_EXPECTS(assignment != nullptr, "Invalid input argument: assignment pointer is NULL"); CUGRAPH_EXPECTS(graph.edge_data != nullptr, - "Invalid API parameter: graph must have edge data (costs)"); + "Invalid input argument: graph must have edge data (costs)"); #ifdef TIMING HighResTimer hr_timer; @@ -86,63 +142,68 @@ weight_t hungarian_sparse(raft::handle_t const &handle, vertex_t matrix_dimension = std::max(num_rows, num_cols); - rmm::device_vector cost_v(matrix_dimension * matrix_dimension); - rmm::device_vector tasks_v(num_cols); - rmm::device_vector temp_tasks_v(graph.number_of_vertices); - rmm::device_vector temp_workers_v(graph.number_of_vertices); + rmm::device_uvector cost_v(matrix_dimension * matrix_dimension, + handle.get_stream_view()); + rmm::device_uvector tasks_v(num_cols, handle.get_stream_view()); + rmm::device_uvector temp_tasks_v(graph.number_of_vertices, handle.get_stream_view()); + rmm::device_uvector temp_workers_v(graph.number_of_vertices, handle.get_stream_view()); - weight_t *d_cost = cost_v.data().get(); - vertex_t *d_tasks = tasks_v.data().get(); - vertex_t *d_temp_tasks = temp_tasks_v.data().get(); - vertex_t *d_temp_workers = temp_workers_v.data().get(); - vertex_t *d_src_indices = graph.src_indices; - vertex_t *d_dst_indices = graph.dst_indices; - weight_t *d_edge_data = graph.edge_data; + weight_t* d_cost = cost_v.data(); + vertex_t* d_tasks = tasks_v.data(); + vertex_t* d_temp_tasks = temp_tasks_v.data(); + vertex_t* d_temp_workers = temp_workers_v.data(); + vertex_t* d_src_indices = graph.src_indices; + vertex_t* d_dst_indices = graph.dst_indices; + weight_t* d_edge_data = graph.edge_data; // // Renumber vertices internally. 
Workers will become // rows, tasks will become columns // - thrust::sequence(rmm::exec_policy(stream)->on(stream), temp_tasks_v.begin(), temp_tasks_v.end()); + thrust::sequence( + rmm::exec_policy(handle.get_stream_view()), temp_tasks_v.begin(), temp_tasks_v.end()); - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(handle.get_stream_view()), workers, workers + num_workers, [d_temp_tasks] __device__(vertex_t v) { d_temp_tasks[v] = -1; }); - auto temp_end = thrust::copy_if(rmm::exec_policy(stream)->on(stream), + auto temp_end = thrust::copy_if(rmm::exec_policy(handle.get_stream_view()), temp_tasks_v.begin(), temp_tasks_v.end(), d_tasks, [] __device__(vertex_t v) { return v >= 0; }); vertex_t size = thrust::distance(d_tasks, temp_end); - tasks_v.resize(size); + tasks_v.resize(size, handle.get_stream_view()); // // Now we'll assign costs into the dense array // - thrust::fill(rmm::exec_policy(stream)->on(stream), + thrust::fill(rmm::exec_policy(handle.get_stream_view()), temp_workers_v.begin(), temp_workers_v.end(), vertex_t{-1}); + thrust::fill(rmm::exec_policy(handle.get_stream_view()), + temp_tasks_v.begin(), + temp_tasks_v.end(), + vertex_t{-1}); thrust::fill( - rmm::exec_policy(stream)->on(stream), temp_tasks_v.begin(), temp_tasks_v.end(), vertex_t{-1}); - thrust::fill(rmm::exec_policy(stream)->on(stream), cost_v.begin(), cost_v.end(), weight_t{0}); + rmm::exec_policy(handle.get_stream_view()), cost_v.begin(), cost_v.end(), weight_t{0}); thrust::for_each( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(handle.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_rows), [d_temp_workers, workers] __device__(vertex_t v) { d_temp_workers[workers[v]] = v; }); thrust::for_each( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(handle.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [d_temp_tasks, d_tasks] __device__(vertex_t v) { d_temp_tasks[d_tasks[v]] = v; }); - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(handle.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_edges), [d_temp_workers, @@ -170,11 +231,11 @@ weight_t hungarian_sparse(raft::handle_t const &handle, // temp_assignment_v will hold the assignment in the dense // bipartite matrix numbering // - rmm::device_vector temp_assignment_v(matrix_dimension); - vertex_t *d_temp_assignment = temp_assignment_v.data().get(); + rmm::device_uvector temp_assignment_v(matrix_dimension, handle.get_stream_view()); + vertex_t* d_temp_assignment = temp_assignment_v.data(); weight_t min_cost = detail::hungarian( - handle, matrix_dimension, matrix_dimension, d_cost, d_temp_assignment, stream); + handle, matrix_dimension, matrix_dimension, d_cost, d_temp_assignment, epsilon); #ifdef TIMING hr_timer.stop(); @@ -185,7 +246,7 @@ weight_t hungarian_sparse(raft::handle_t const &handle, // // Translate the assignment back to the original vertex ids // - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(handle.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_rows), [d_tasks, d_temp_assignment, assignment] __device__(vertex_t id) { @@ -204,54 +265,106 @@ weight_t hungarian_sparse(raft::handle_t const &handle, } // namespace detail template -weight_t hungarian(raft::handle_t const &handle, - GraphCOOView const &graph, +weight_t 
hungarian(raft::handle_t const& handle, + legacy::GraphCOOView const& graph, vertex_t num_workers, - vertex_t const *workers, - vertex_t *assignment) + vertex_t const* workers, + vertex_t* assignment) { - cudaStream_t stream{0}; + return detail::hungarian_sparse( + handle, graph, num_workers, workers, assignment, detail::default_epsilon()); +} - return detail::hungarian_sparse(handle, graph, num_workers, workers, assignment, stream); +template +weight_t hungarian(raft::handle_t const& handle, + legacy::GraphCOOView const& graph, + vertex_t num_workers, + vertex_t const* workers, + vertex_t* assignment, + weight_t epsilon) +{ + return detail::hungarian_sparse(handle, graph, num_workers, workers, assignment, epsilon); } template int32_t hungarian( - raft::handle_t const &, - GraphCOOView const &, + raft::handle_t const&, + legacy::GraphCOOView const&, + int32_t, + int32_t const*, + int32_t*, + int32_t); + +template float hungarian( + raft::handle_t const&, + legacy::GraphCOOView const&, + int32_t, + int32_t const*, + int32_t*, + float); +template double hungarian( + raft::handle_t const&, + legacy::GraphCOOView const&, + int32_t, + int32_t const*, + int32_t*, + double); + +template int32_t hungarian( + raft::handle_t const&, + legacy::GraphCOOView const&, int32_t, - int32_t const *, - int32_t *); -template float hungarian(raft::handle_t const &, - GraphCOOView const &, - int32_t, - int32_t const *, - int32_t *); -template double hungarian(raft::handle_t const &, - GraphCOOView const &, - int32_t, - int32_t const *, - int32_t *); + int32_t const*, + int32_t*); + +template float hungarian( + raft::handle_t const&, + legacy::GraphCOOView const&, + int32_t, + int32_t const*, + int32_t*); +template double hungarian( + raft::handle_t const&, + legacy::GraphCOOView const&, + int32_t, + int32_t const*, + int32_t*); namespace dense { template -weight_t hungarian(raft::handle_t const &handle, - weight_t const *costs, +weight_t hungarian(raft::handle_t const& handle, + weight_t const* costs, index_t num_rows, index_t num_cols, - index_t *assignment) + index_t* assignment) { - cudaStream_t stream{0}; + return detail::hungarian( + handle, num_rows, num_cols, costs, assignment, detail::default_epsilon()); +} - return detail::hungarian(handle, num_rows, num_cols, costs, assignment, stream); +template +weight_t hungarian(raft::handle_t const& handle, + weight_t const* costs, + index_t num_rows, + index_t num_cols, + index_t* assignment, + weight_t epsilon) +{ + return detail::hungarian(handle, num_rows, num_cols, costs, assignment, epsilon); } template int32_t hungarian( - raft::handle_t const &, int32_t const *, int32_t, int32_t, int32_t *); + raft::handle_t const&, int32_t const*, int32_t, int32_t, int32_t*); +template float hungarian( + raft::handle_t const&, float const*, int32_t, int32_t, int32_t*); +template double hungarian( + raft::handle_t const&, double const*, int32_t, int32_t, int32_t*); +template int32_t hungarian( + raft::handle_t const&, int32_t const*, int32_t, int32_t, int32_t*, int32_t); template float hungarian( - raft::handle_t const &, float const *, int32_t, int32_t, int32_t *); + raft::handle_t const&, float const*, int32_t, int32_t, int32_t*, float); template double hungarian( - raft::handle_t const &, double const *, int32_t, int32_t, int32_t *); + raft::handle_t const&, double const*, int32_t, int32_t, int32_t*, double); } // namespace dense diff --git a/cpp/src/link_analysis/gunrock_hits.cpp b/cpp/src/link_analysis/gunrock_hits.cpp index 8662c3bea79..a86210cc521 100644 --- 
a/cpp/src/link_analysis/gunrock_hits.cpp +++ b/cpp/src/link_analysis/gunrock_hits.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,10 +19,10 @@ * @brief wrapper calling gunrock's HITS analytic * --------------------------------------------------------------------------*/ -#include -#include +#include +#include -#include +#include #include @@ -34,17 +34,17 @@ const int HOST{1}; // gunrock should expose the device constant at the API le const int DEVICE{2}; // gunrock should expose the device constant at the API level. template -void hits(cugraph::GraphCSRView const &graph, +void hits(cugraph::legacy::GraphCSRView const& graph, int max_iter, weight_t tolerance, - weight_t const *starting_value, + weight_t const* starting_value, bool normalized, - weight_t *hubs, - weight_t *authorities) + weight_t* hubs, + weight_t* authorities) { - CUGRAPH_EXPECTS(hubs != nullptr, "Invalid API parameter: hubs array should be of size V"); + CUGRAPH_EXPECTS(hubs != nullptr, "Invalid input argument: hubs array should be of size V"); CUGRAPH_EXPECTS(authorities != nullptr, - "Invalid API parameter: authorities array should be of size V"); + "Invalid input argument: authorities array should be of size V"); // // NOTE: gunrock doesn't support passing a starting value @@ -61,13 +61,13 @@ void hits(cugraph::GraphCSRView const &graph, DEVICE); } -template void hits(cugraph::GraphCSRView const &, +template void hits(cugraph::legacy::GraphCSRView const&, int, float, - float const *, + float const*, bool, - float *, - float *); + float*, + float*); } // namespace gunrock } // namespace cugraph diff --git a/cpp/src/link_analysis/pagerank.cu b/cpp/src/link_analysis/pagerank.cu deleted file mode 100644 index e5da24e328d..00000000000 --- a/cpp/src/link_analysis/pagerank.cu +++ /dev/null @@ -1,432 +0,0 @@ -/* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. 
- * - */ - -// Pagerank solver -// Author: Alex Fender afender@nvidia.com - -#include -#include -#include -#include -#include -#include -#include -#include -#include "cub/cub.cuh" - -#include -#include -#include - -#include -#include "pagerank_1D.cuh" -#include "utilities/graph_utils.cuh" - -#include - -namespace cugraph { -namespace detail { - -#ifdef DEBUG -#define PR_VERBOSE -#endif - -template -bool pagerankIteration(raft::handle_t const &handle, - IndexType n, - IndexType e, - IndexType const *cscPtr, - IndexType const *cscInd, - ValueType *cscVal, - ValueType alpha, - ValueType *a, - ValueType *b, - float tolerance, - int iter, - int max_iter, - ValueType *&tmp, - void *cub_d_temp_storage, - size_t cub_temp_storage_bytes, - ValueType *&pr, - ValueType *residual) -{ - ValueType dot_res; -//#if defined(CUDART_VERSION) and CUDART_VERSION >= 11000 -#if 1 - { - raft::matrix::sparse_matrix_t const r_csr_m{ - handle, cscPtr, cscInd, cscVal, n, e}; - r_csr_m.mv(1.0, tmp, 0.0, pr); - } -#else - CUDA_TRY(cub::DeviceSpmv::CsrMV(cub_d_temp_storage, - cub_temp_storage_bytes, - cscVal, - (IndexType *)cscPtr, - (IndexType *)cscInd, - tmp, - pr, - n, - n, - e)); -#endif - scal(n, alpha, pr); - dot_res = dot(n, a, tmp); - axpy(n, dot_res, b, pr); - scal(n, (ValueType)1.0 / nrm2(n, pr), pr); - axpy(n, (ValueType)-1.0, pr, tmp); - *residual = nrm2(n, tmp); - if (*residual < tolerance) { - scal(n, (ValueType)1.0 / nrm1(n, pr), pr); - return true; - } else { - if (iter < max_iter) { - // FIXME: Copy the pagerank vector results to the tmp vector, since there - // are still raw pointers in pagerank pointing to tmp vector locations - // that were std::swapped out in the solver. A thrust::swap would - // probably be more efficent if the vectors were passed everywhere instead - // of pointers. std::swap is unsafe though. Just copying for now, as this - // may soon be replaced by the pattern accelerator. 
- copy(n, pr, tmp); - } else { - scal(n, (ValueType)1.0 / nrm1(n, pr), pr); - } - return false; - } -} - -template -int pagerankSolver(raft::handle_t const &handle, - IndexType n, - IndexType e, - IndexType const *cscPtr, - IndexType const *cscInd, - ValueType *cscVal, - IndexType *prsVtx, - ValueType *prsVal, - IndexType prsLen, - bool has_personalization, - ValueType alpha, - ValueType *a, - bool has_guess, - float tolerance, - int max_iter, - ValueType *&pagerank_vector, - ValueType *&residual) -{ - int max_it, i = 0; - float tol; - bool converged = false; - ValueType randomProbability = static_cast(1.0 / n); - ValueType *tmp_d{nullptr}; - ValueType *b_d{nullptr}; - void *cub_d_temp_storage = NULL; - size_t cub_temp_storage_bytes = 0; - - if (max_iter > 0) - max_it = max_iter; - else - max_it = 500; - - if (tolerance == 0.0f) - tol = 1.0E-6f; - else if (tolerance < 1.0f && tolerance > 0.0f) - tol = tolerance; - else - return -1; - - if (alpha <= 0.0f || alpha >= 1.0f) return -1; - - rmm::device_vector b(n); - b_d = b.data().get(); - -#if 1 /* temporary solution till https://github.com/NVlabs/cub/issues/162 is resolved */ - thrust::device_vector tmp(n); - tmp_d = tmp.data().get(); -#else - rmm::device_vector tmp(n); - tmp_d = pr.data().get(); -#endif - // FIXME: this should take a passed CUDA strema instead of default nullptr - CHECK_CUDA(nullptr); - - if (!has_guess) { - fill(n, pagerank_vector, randomProbability); - fill(n, tmp_d, randomProbability); - } else { - copy(n, pagerank_vector, tmp_d); - } - - if (has_personalization) { - ValueType sum = nrm1(prsLen, prsVal); - if (static_cast(0) == sum) { - fill(n, b_d, randomProbability); - } else { - scal(n, static_cast(1.0 / sum), prsVal); - fill(n, b_d, static_cast(0)); - scatter(prsLen, prsVal, b_d, prsVtx); - } - } else { - fill(n, b_d, randomProbability); - } - update_dangling_nodes(n, a, alpha); - -//#if defined(CUDART_VERSION) and CUDART_VERSION >= 11000 -#if 1 - { - raft::matrix::sparse_matrix_t const r_csr_m{ - handle, cscPtr, cscInd, cscVal, n, e}; - r_csr_m.mv(1.0, tmp_d, 0.0, pagerank_vector); - } -#else - CUDA_TRY(cub::DeviceSpmv::CsrMV(cub_d_temp_storage, - cub_temp_storage_bytes, - cscVal, - (IndexType *)cscPtr, - (IndexType *)cscInd, - tmp_d, - pagerank_vector, - n, - n, - e)); -#endif - // Allocate temporary storage - rmm::device_buffer cub_temp_storage(cub_temp_storage_bytes); - cub_d_temp_storage = cub_temp_storage.data(); - -#ifdef PR_VERBOSE - std::stringstream ss; - ss.str(std::string()); - ss << " ------------------PageRank------------------" << std::endl; - ss << " --------------------------------------------" << std::endl; - ss << std::setw(10) << "Iteration" << std::setw(15) << "Residual" << std::endl; - ss << " --------------------------------------------" << std::endl; - std::cout << ss.str(); -#endif - - while (!converged && i < max_it) { - i++; - converged = pagerankIteration(handle, - n, - e, - cscPtr, - cscInd, - cscVal, - alpha, - a, - b_d, - tol, - i, - max_it, - tmp_d, - cub_d_temp_storage, - cub_temp_storage_bytes, - pagerank_vector, - residual); -#ifdef PR_VERBOSE - ss.str(std::string()); - ss << std::setw(10) << i; - ss.precision(3); - ss << std::setw(15) << std::scientific << *residual << std::endl; - std::cout << ss.str(); -#endif - } -#ifdef PR_VERBOSE - std::cout << " --------------------------------------------" << std::endl; -#endif - - return converged ? 
0 : 1; -} - -// template int pagerankSolver ( int n, int e, int *cscPtr, int *cscInd,half *cscVal, -// half alpha, half *a, bool has_guess, float tolerance, int max_iter, half * &pagerank_vector, half -// * &residual); -template int pagerankSolver(raft::handle_t const &handle, - int n, - int e, - int const *cscPtr, - int const *cscInd, - float *cscVal, - int *prsVtx, - float *prsVal, - int prsLen, - bool has_personalization, - float alpha, - float *a, - bool has_guess, - float tolerance, - int max_iter, - float *&pagerank_vector, - float *&residual); -template int pagerankSolver(raft::handle_t const &handle, - int n, - int e, - const int *cscPtr, - int const *cscInd, - double *cscVal, - int *prsVtx, - double *prsVal, - int prsLen, - bool has_personalization, - double alpha, - double *a, - bool has_guess, - float tolerance, - int max_iter, - double *&pagerank_vector, - double *&residual); - -template -void pagerank_impl(raft::handle_t const &handle, - GraphCSCView const &graph, - WT *pagerank, - VT personalization_subset_size = 0, - VT *personalization_subset = nullptr, - WT *personalization_values = nullptr, - double alpha = 0.85, - double tolerance = 1e-5, - int64_t max_iter = 100, - bool has_guess = false) -{ - bool has_personalization = false; - int prsLen = 0; - VT m = graph.number_of_vertices; - ET nnz = graph.number_of_edges; - int status{0}; - WT *d_pr{nullptr}, *d_val{nullptr}, *d_leaf_vector{nullptr}; - WT res = 1.0; - WT *residual = &res; - - if (personalization_subset_size != 0) { - CUGRAPH_EXPECTS(personalization_subset != nullptr, - "Invalid API parameter: personalization_subset array should be of size " - "personalization_subset_size"); - CUGRAPH_EXPECTS(personalization_values != nullptr, - "Invalid API parameter: personalization_values array should be of size " - "personalization_subset_size"); - CUGRAPH_EXPECTS(personalization_subset_size <= m, - "Personalization size should be smaller than V"); - has_personalization = true; - prsLen = static_cast(personalization_subset_size); - } - -#if 1 /* temporary solution till https://github.com/NVlabs/cub/issues/162 is resolved */ - thrust::device_vector pr(m); - d_pr = pr.data().get(); -#else - rmm::device_vector pr(m); - d_pr = pr.data().get(); -#endif - - rmm::device_vector leaf_vector(m); - rmm::device_vector val(nnz); - - d_leaf_vector = leaf_vector.data().get(); - d_val = val.data().get(); - - // The templating for HT_matrix_csc_coo assumes that m, nnz and data are all the same type - HT_matrix_csc_coo(m, nnz, graph.offsets, graph.indices, d_val, d_leaf_vector); - - if (has_guess) { copy(m, (WT *)pagerank, d_pr); } - - status = pagerankSolver(handle, - m, - nnz, - graph.offsets, - graph.indices, - d_val, - personalization_subset, - personalization_values, - prsLen, - has_personalization, - alpha, - d_leaf_vector, - has_guess, - tolerance, - max_iter, - d_pr, - residual); - - switch (status) { - case 0: break; - case -1: CUGRAPH_FAIL("Error : bad parameters in Pagerank"); - case 1: break; // Warning : Pagerank did not reached the desired tolerance - default: CUGRAPH_FAIL("Pagerank exec failed"); - } - - copy(m, d_pr, (WT *)pagerank); -} -} // namespace detail - -template -void pagerank(raft::handle_t const &handle, - GraphCSCView const &graph, - WT *pagerank, - VT personalization_subset_size, - VT *personalization_subset, - WT *personalization_values, - double alpha, - double tolerance, - int64_t max_iter, - bool has_guess) -{ - CUGRAPH_EXPECTS(pagerank != nullptr, "Invalid API parameter: Pagerank array should be of size V"); - 
// Multi-GPU - if (handle.comms_initialized()) { - CUGRAPH_EXPECTS(has_guess == false, - "Invalid API parameter: Multi-GPU Pagerank does not guess, please use the " - "single GPU version for this feature"); - CUGRAPH_EXPECTS(max_iter > 0, "The number of iteration must be positive"); - cugraph::mg::pagerank(handle, - graph, - pagerank, - personalization_subset_size, - personalization_subset, - personalization_values, - alpha, - max_iter, - tolerance); - } else // Single GPU - return detail::pagerank_impl(handle, - graph, - pagerank, - personalization_subset_size, - personalization_subset, - personalization_values, - alpha, - tolerance, - max_iter, - has_guess); -} - -// explicit instantiation -template void pagerank(raft::handle_t const &handle, - GraphCSCView const &graph, - float *pagerank, - int personalization_subset_size, - int *personalization_subset, - float *personalization_values, - double alpha, - double tolerance, - int64_t max_iter, - bool has_guess); -template void pagerank(raft::handle_t const &handle, - GraphCSCView const &graph, - double *pagerank, - int personalization_subset_size, - int *personalization_subset, - double *personalization_values, - double alpha, - double tolerance, - int64_t max_iter, - bool has_guess); - -} // namespace cugraph diff --git a/cpp/src/link_analysis/pagerank_1D.cu b/cpp/src/link_analysis/pagerank_1D.cu deleted file mode 100644 index 3774a364cf1..00000000000 --- a/cpp/src/link_analysis/pagerank_1D.cu +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Author: Alex Fender afender@nvidia.com - -#include -#include -#include "pagerank_1D.cuh" -#include "utilities/graph_utils.cuh" - -namespace cugraph { -namespace mg { - -template -__global__ void transition_kernel(const size_t e, const VT *ind, const VT *degree, WT *val) -{ - for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) - val[i] = 1.0 / degree[ind[i]]; // Degree contains IN degree. So all degree[ind[i]] were - // incremented by definition (no div by 0). 
-} - -template -Pagerank::Pagerank(const raft::handle_t &handle_, GraphCSCView const &G) - : comm(handle_.get_comms()), - bookmark(G.number_of_vertices), - prev_pr(G.number_of_vertices), - val(G.local_edges[comm.get_rank()]), - handle(handle_), - has_personalization(false) -{ - v_glob = G.number_of_vertices; - v_loc = G.local_vertices[comm.get_rank()]; - e_loc = G.local_edges[comm.get_rank()]; - part_off = G.local_offsets; - local_vertices = G.local_vertices; - off = G.offsets; - ind = G.indices; - blocks = handle_.get_device_properties().maxGridSize[0]; - threads = handle_.get_device_properties().maxThreadsPerBlock; - sm_count = handle_.get_device_properties().multiProcessorCount; - - is_setup = false; -} - -template -Pagerank::~Pagerank() -{ -} - -template -void Pagerank::transition_vals(const VT *degree) -{ - if (e_loc > 0) { - int threads = std::min(e_loc, this->threads); - int blocks = std::min(32 * sm_count, this->blocks); - transition_kernel<<>>(e_loc, ind, degree, val.data().get()); - CHECK_CUDA(nullptr); - } -} - -template -void Pagerank::flag_leafs(const VT *degree) -{ - if (v_glob > 0) { - int threads = std::min(v_glob, this->threads); - int blocks = std::min(32 * sm_count, this->blocks); - cugraph::detail::flag_leafs_kernel - <<>>(v_glob, degree, bookmark.data().get()); - CHECK_CUDA(nullptr); - } -} - -// Artificially create the google matrix by setting val and bookmark -template -void Pagerank::setup(WT _alpha, - VT *degree, - VT personalization_subset_size, - VT *personalization_subset, - WT *personalization_values) -{ - if (!is_setup) { - alpha = _alpha; - WT zero = 0.0; - WT one = 1.0; - // Update dangling node vector - cugraph::detail::fill(v_glob, bookmark.data().get(), zero); - flag_leafs(degree); - cugraph::detail::update_dangling_nodes(v_glob, bookmark.data().get(), alpha); - - // Transition matrix - transition_vals(degree); - - // personalize - if (personalization_subset_size != 0) { - CUGRAPH_EXPECTS(personalization_subset != nullptr, - "Invalid API parameter: personalization_subset array should be of size " - "personalization_subset_size"); - CUGRAPH_EXPECTS(personalization_values != nullptr, - "Invalid API parameter: personalization_values array should be of size " - "personalization_subset_size"); - CUGRAPH_EXPECTS(personalization_subset_size <= v_glob, - "Personalization size should be smaller than V"); - - WT sum = cugraph::detail::nrm1(personalization_subset_size, personalization_values); - if (sum != zero) { - has_personalization = true; - personalization_vector.resize(v_glob); - cugraph::detail::fill(v_glob, personalization_vector.data().get(), zero); - cugraph::detail::scal(v_glob, one / sum, personalization_values); - cugraph::detail::scatter(personalization_subset_size, - personalization_values, - personalization_vector.data().get(), - personalization_subset); - } - } - is_setup = true; - } else - CUGRAPH_FAIL("MG PageRank : Setup can be called only once"); -} - -// run the power iteration on the google matrix -template -int Pagerank::solve(int max_iter, float tolerance, WT *pagerank) -{ - if (is_setup) { - WT dot_res; - WT one = 1.0; - WT *pr = pagerank; - cugraph::detail::fill(v_glob, pagerank, one / v_glob); - cugraph::detail::fill(v_glob, prev_pr.data().get(), one / v_glob); - // This cuda sync was added to fix #426 - // This should not be requiered in theory - // This is not needed on one GPU at this time - cudaDeviceSynchronize(); - dot_res = cugraph::detail::dot(v_glob, bookmark.data().get(), pr); - MGcsrmv spmv_solver( - handle, local_vertices, 
part_off, off, ind, val.data().get(), pagerank); - - WT residual; - int i; - for (i = 0; i < max_iter; ++i) { - spmv_solver.run(pagerank); - cugraph::detail::scal(v_glob, alpha, pr); - - // personalization - if (has_personalization) - cugraph::detail::axpy(v_glob, dot_res, personalization_vector.data().get(), pr); - else - cugraph::detail::addv(v_glob, dot_res * (one / v_glob), pr); - - dot_res = cugraph::detail::dot(v_glob, bookmark.data().get(), pr); - cugraph::detail::scal(v_glob, one / cugraph::detail::nrm2(v_glob, pr), pr); - - // convergence check - cugraph::detail::axpy(v_glob, (WT)-1.0, pr, prev_pr.data().get()); - residual = cugraph::detail::nrm2(v_glob, prev_pr.data().get()); - if (residual < tolerance) - break; - else - cugraph::detail::copy(v_glob, pr, prev_pr.data().get()); - } - cugraph::detail::scal(v_glob, one / cugraph::detail::nrm1(v_glob, pr), pr); - return i; - } else { - CUGRAPH_FAIL("MG PageRank : Solve was called before setup"); - } -} - -template class Pagerank; -template class Pagerank; - -} // namespace mg -} // namespace cugraph - -#include "utilities/eidir_graph_utils.hpp" diff --git a/cpp/src/link_analysis/pagerank_1D.cuh b/cpp/src/link_analysis/pagerank_1D.cuh deleted file mode 100644 index feb410daa9a..00000000000 --- a/cpp/src/link_analysis/pagerank_1D.cuh +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Author: Alex Fender afender@nvidia.com - -#pragma once - -#include -#include -#include - -#include "utilities/error.hpp" -#include "utilities/spmv_1D.cuh" - -namespace cugraph { -namespace mg { - -template -class Pagerank { - private: - VT v_glob{}; // global number of vertices - VT v_loc{}; // local number of vertices - ET e_loc{}; // local number of edges - WT alpha{}; // damping factor - bool has_personalization; - // CUDA - const raft::comms::comms_t &comm; // info about the mg comm setup - cudaStream_t stream; - int blocks; - int threads; - int sm_count; - - // Vertex offsets for each partition. - VT *part_off; - VT *local_vertices; - - // Google matrix - ET *off; - VT *ind; - - rmm::device_vector val; // values of the substochastic matrix - rmm::device_vector bookmark; // constant vector with dangling node info - rmm::device_vector prev_pr; // record the last pagerank for convergence check - rmm::device_vector personalization_vector; // personalization vector after reconstruction - - bool is_setup; - raft::handle_t const &handle; // raft handle propagation for SpMV, etc. 
- - public: - Pagerank(const raft::handle_t &handle, const GraphCSCView &G); - ~Pagerank(); - - void transition_vals(const VT *degree); - - void flag_leafs(const VT *degree); - - // Artificially create the google matrix by setting val and bookmark - void setup(WT _alpha, - VT *degree, - VT personalization_subset_size, - VT *personalization_subset, - WT *personalization_values); - - // run the power iteration on the google matrix, return the number of iterations - int solve(int max_iter, float tolerance, WT *pagerank); -}; - -template -int pagerank(raft::handle_t const &handle, - const GraphCSCView &G, - WT *pagerank_result, - VT personalization_subset_size, - VT *personalization_subset, - WT *personalization_values, - const double damping_factor = 0.85, - const int64_t n_iter = 100, - const double tolerance = 1e-5) -{ - // null pointers check - CUGRAPH_EXPECTS(G.offsets != nullptr, "Invalid API parameter - offsets is null"); - CUGRAPH_EXPECTS(G.indices != nullptr, "Invalid API parameter - indidices is null"); - CUGRAPH_EXPECTS(pagerank_result != nullptr, - "Invalid API parameter - pagerank output memory must be allocated"); - - // parameter values - CUGRAPH_EXPECTS(damping_factor > 0.0, - "Invalid API parameter - invalid damping factor value (alpha<0)"); - CUGRAPH_EXPECTS(damping_factor < 1.0, - "Invalid API parameter - invalid damping factor value (alpha>1)"); - CUGRAPH_EXPECTS(n_iter > 0, "Invalid API parameter - n_iter must be > 0"); - - rmm::device_vector degree(G.number_of_vertices); - - // in-degree of CSC (equivalent to out-degree of original edge list) - G.degree(degree.data().get(), DegreeDirection::IN); - - // Allocate and intialize Pagerank class - Pagerank pr_solver(handle, G); - - // Set all constants info - pr_solver.setup(damping_factor, - degree.data().get(), - personalization_subset_size, - personalization_subset, - personalization_values); - - // Run pagerank - return pr_solver.solve(n_iter, tolerance, pagerank_result); -} - -} // namespace mg -} // namespace cugraph diff --git a/cpp/src/link_prediction/jaccard.cu b/cpp/src/link_prediction/jaccard.cu index 70952974b39..25186a6492b 100644 --- a/cpp/src/link_prediction/jaccard.cu +++ b/cpp/src/link_prediction/jaccard.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,10 +19,12 @@ * @file jaccard.cu * ---------------------------------------------------------------------------**/ -#include -#include -#include "graph.hpp" -#include "utilities/graph_utils.cuh" +#include +#include +#include + +#include +#include namespace cugraph { namespace detail { @@ -30,7 +32,7 @@ namespace detail { // Volume of neighboors (*weight_s) template __global__ void jaccard_row_sum( - vertex_t n, edge_t const *csrPtr, vertex_t const *csrInd, weight_t const *v, weight_t *work) + vertex_t n, edge_t const* csrPtr, vertex_t const* csrInd, weight_t const* v, weight_t* work) { vertex_t row; edge_t start, end, length; @@ -54,12 +56,12 @@ __global__ void jaccard_row_sum( // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) template __global__ void jaccard_is(vertex_t n, - edge_t const *csrPtr, - vertex_t const *csrInd, - weight_t const *v, - weight_t *work, - weight_t *weight_i, - weight_t *weight_s) + edge_t const* csrPtr, + vertex_t const* csrInd, + weight_t const* v, + weight_t* work, + weight_t* weight_i, + weight_t* weight_s) { edge_t i, j, Ni, Nj; vertex_t row, col; @@ -118,14 +120,14 @@ __global__ void jaccard_is(vertex_t n, // Using list of node pairs template __global__ void jaccard_is_pairs(edge_t num_pairs, - edge_t const *csrPtr, - vertex_t const *csrInd, - vertex_t const *first_pair, - vertex_t const *second_pair, - weight_t const *v, - weight_t *work, - weight_t *weight_i, - weight_t *weight_s) + edge_t const* csrPtr, + vertex_t const* csrInd, + vertex_t const* first_pair, + vertex_t const* second_pair, + weight_t const* v, + weight_t* work, + weight_t* weight_i, + weight_t* weight_s) { edge_t i, idx, Ni, Nj, match; vertex_t row, col, ref, cur, ref_col, cur_col; @@ -182,9 +184,9 @@ __global__ void jaccard_is_pairs(edge_t num_pairs, // Jaccard weights (*weight) template __global__ void jaccard_jw(edge_t e, - weight_t const *weight_i, - weight_t const *weight_s, - weight_t *weight_j) + weight_t const* weight_i, + weight_t const* weight_s, + weight_t* weight_j) { edge_t j; weight_t Wi, Ws, Wu; @@ -200,14 +202,15 @@ __global__ void jaccard_jw(edge_t e, template int jaccard(vertex_t n, edge_t e, - edge_t const *csrPtr, - vertex_t const *csrInd, - weight_t const *weight_in, - weight_t *work, - weight_t *weight_i, - weight_t *weight_s, - weight_t *weight_j) + edge_t const* csrPtr, + vertex_t const* csrInd, + weight_t const* weight_in, + weight_t* work, + weight_t* weight_i, + weight_t* weight_s, + weight_t* weight_j) { + rmm::cuda_stream_view stream_view; dim3 nthreads, nblocks; int y = 4; @@ -221,9 +224,9 @@ int jaccard(vertex_t n, // launch kernel jaccard_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - cudaDeviceSynchronize(); - fill(e, weight_i, weight_t{0.0}); + <<>>(n, csrPtr, csrInd, weight_in, work); + + thrust::fill(rmm::exec_policy(stream_view), weight_i, weight_i + e, weight_t{0.0}); // setup launch configuration nthreads.x = 32 / y; @@ -234,8 +237,8 @@ int jaccard(vertex_t n, nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; // launch kernel - jaccard_is - <<>>(n, csrPtr, csrInd, weight_in, work, weight_i, weight_s); + jaccard_is<<>>( + n, csrPtr, csrInd, weight_in, work, weight_i, weight_s); // setup launch configuration nthreads.x = min(e, edge_t{CUDA_MAX_KERNEL_THREADS}); @@ -247,7 +250,7 @@ int jaccard(vertex_t n, // launch kernel jaccard_jw - <<>>(e, weight_i, weight_s, weight_j); + <<>>(e, weight_i, weight_s, weight_j); return 0; } @@ -255,15 +258,15 @@ int jaccard(vertex_t n, 
template int jaccard_pairs(vertex_t n, edge_t num_pairs, - edge_t const *csrPtr, - vertex_t const *csrInd, - vertex_t const *first_pair, - vertex_t const *second_pair, - weight_t const *weight_in, - weight_t *work, - weight_t *weight_i, - weight_t *weight_s, - weight_t *weight_j) + edge_t const* csrPtr, + vertex_t const* csrInd, + vertex_t const* first_pair, + vertex_t const* second_pair, + weight_t const* weight_in, + weight_t* work, + weight_t* weight_i, + weight_t* weight_s, + weight_t* weight_j) { dim3 nthreads, nblocks; int y = 4; @@ -313,9 +316,9 @@ } // namespace detail template -void jaccard(GraphCSRView const &graph, WT const *weights, WT *result) +void jaccard(legacy::GraphCSRView const& graph, WT const* weights, WT* result) { - CUGRAPH_EXPECTS(result != nullptr, "Invalid API parameter: result pointer is NULL"); + CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); rmm::device_vector weight_i(graph.number_of_edges); rmm::device_vector weight_s(graph.number_of_edges); @@ -345,16 +348,16 @@ } template -void jaccard_list(GraphCSRView const &graph, - WT const *weights, +void jaccard_list(legacy::GraphCSRView const& graph, + WT const* weights, ET num_pairs, - VT const *first, - VT const *second, - WT *result) + VT const* first, + VT const* second, + WT* result) { - CUGRAPH_EXPECTS(result != nullptr, "Invalid API parameter: result pointer is NULL"); - CUGRAPH_EXPECTS(first != nullptr, "Invalid API parameter: first is NULL"); - CUGRAPH_EXPECTS(second != nullptr, "Invalid API parameter: second in NULL"); + CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); + CUGRAPH_EXPECTS(first != nullptr, "Invalid input argument: first is NULL"); + CUGRAPH_EXPECTS(second != nullptr, "Invalid input argument: second is NULL"); rmm::device_vector weight_i(num_pairs, WT{0.0}); rmm::device_vector weight_s(num_pairs); @@ -387,41 +390,43 @@ } } -template void jaccard(GraphCSRView const &, - float const *, - float *); -template void jaccard(GraphCSRView const &, - double const *, - double *); -template void jaccard(GraphCSRView const &, - float const *, - float *); -template void jaccard(GraphCSRView const &, - double const *, - double *); -template void jaccard_list(GraphCSRView const &, - float const *, - int32_t, - int32_t const *, - int32_t const *, - float *); -template void jaccard_list(GraphCSRView const &, - double const *, - int32_t, - int32_t const *, - int32_t const *, - double *); -template void jaccard_list(GraphCSRView const &, - float const *, - int64_t, - int64_t const *, - int64_t const *, - float *); -template void jaccard_list(GraphCSRView const &, - double const *, - int64_t, - int64_t const *, - int64_t const *, - double *); +template void jaccard(legacy::GraphCSRView const&, + float const*, + float*); +template void jaccard( + legacy::GraphCSRView const&, double const*, double*); +template void jaccard(legacy::GraphCSRView const&, + float const*, + float*); +template void jaccard( + legacy::GraphCSRView const&, double const*, double*); +template void jaccard_list( + legacy::GraphCSRView const&, + float const*, + int32_t, + int32_t const*, + int32_t const*, + float*); +template void jaccard_list( + legacy::GraphCSRView const&, + double const*, + int32_t, + int32_t const*, + int32_t const*, + double*); +template void jaccard_list( + legacy::GraphCSRView const&, + float const*, +
int64_t, + int64_t const*, + int64_t const*, + float*); +template void jaccard_list( + legacy::GraphCSRView const&, + double const*, + int64_t, + int64_t const*, + int64_t const*, + double*); } // namespace cugraph diff --git a/cpp/src/link_prediction/overlap.cu b/cpp/src/link_prediction/overlap.cu index e3f80b50d9a..7b7470da7fc 100644 --- a/cpp/src/link_prediction/overlap.cu +++ b/cpp/src/link_prediction/overlap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,9 +20,9 @@ * ---------------------------------------------------------------------------**/ #include -#include -#include "graph.hpp" -#include "utilities/graph_utils.cuh" +#include +#include +#include namespace cugraph { namespace detail { @@ -31,7 +31,7 @@ namespace detail { // TODO: Identical kernel to jaccard_row_sum!! template __global__ void overlap_row_sum( - vertex_t n, edge_t const *csrPtr, vertex_t const *csrInd, weight_t const *v, weight_t *work) + vertex_t n, edge_t const* csrPtr, vertex_t const* csrInd, weight_t const* v, weight_t* work) { vertex_t row; edge_t start, end, length; @@ -56,12 +56,12 @@ __global__ void overlap_row_sum( // TODO: Identical kernel to jaccard_row_sum!! template __global__ void overlap_is(vertex_t n, - edge_t const *csrPtr, - vertex_t const *csrInd, - weight_t const *v, - weight_t *work, - weight_t *weight_i, - weight_t *weight_s) + edge_t const* csrPtr, + vertex_t const* csrInd, + weight_t const* v, + weight_t* work, + weight_t* weight_i, + weight_t* weight_s) { edge_t i, j, Ni, Nj; vertex_t row, col; @@ -121,14 +121,14 @@ __global__ void overlap_is(vertex_t n, // NOTE: NOT the same as jaccard template __global__ void overlap_is_pairs(edge_t num_pairs, - edge_t const *csrPtr, - vertex_t const *csrInd, - vertex_t const *first_pair, - vertex_t const *second_pair, - weight_t const *v, - weight_t *work, - weight_t *weight_i, - weight_t *weight_s) + edge_t const* csrPtr, + vertex_t const* csrInd, + vertex_t const* first_pair, + vertex_t const* second_pair, + weight_t const* v, + weight_t* work, + weight_t* weight_i, + weight_t* weight_s) { edge_t i, idx, Ni, Nj, match; vertex_t row, col, ref, cur, ref_col, cur_col; @@ -185,11 +185,11 @@ __global__ void overlap_is_pairs(edge_t num_pairs, // Overlap weights (*weight) template __global__ void overlap_jw(edge_t e, - edge_t const *csrPtr, - vertex_t const *csrInd, - weight_t *weight_i, - weight_t *weight_s, - weight_t *weight_j) + edge_t const* csrPtr, + vertex_t const* csrInd, + weight_t* weight_i, + weight_t* weight_s, + weight_t* weight_j) { edge_t j; weight_t Wi, Wu; @@ -204,13 +204,13 @@ __global__ void overlap_jw(edge_t e, template int overlap(vertex_t n, edge_t e, - edge_t const *csrPtr, - vertex_t const *csrInd, - weight_t const *weight_in, - weight_t *work, - weight_t *weight_i, - weight_t *weight_s, - weight_t *weight_j) + edge_t const* csrPtr, + vertex_t const* csrInd, + weight_t const* weight_in, + weight_t* work, + weight_t* weight_i, + weight_t* weight_s, + weight_t* weight_j) { dim3 nthreads, nblocks; int y = 4; @@ -259,15 +259,15 @@ int overlap(vertex_t n, template int overlap_pairs(vertex_t n, edge_t num_pairs, - edge_t const *csrPtr, - vertex_t const *csrInd, - vertex_t const *first_pair, - vertex_t const *second_pair, - weight_t const *weight_in, - weight_t *work, - weight_t *weight_i, - weight_t *weight_s, - weight_t 
*weight_j) + edge_t const* csrPtr, + vertex_t const* csrInd, + vertex_t const* first_pair, + vertex_t const* second_pair, + weight_t const* weight_in, + weight_t* work, + weight_t* weight_i, + weight_t* weight_s, + weight_t* weight_j) { dim3 nthreads, nblocks; int y = 4; @@ -314,9 +314,9 @@ int overlap_pairs(vertex_t n, } // namespace detail template -void overlap(GraphCSRView const &graph, WT const *weights, WT *result) +void overlap(legacy::GraphCSRView const& graph, WT const* weights, WT* result) { - CUGRAPH_EXPECTS(result != nullptr, "Invalid API parameter: result pointer is NULL"); + CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); rmm::device_vector weight_i(graph.number_of_edges); rmm::device_vector weight_s(graph.number_of_edges); @@ -346,16 +346,16 @@ void overlap(GraphCSRView const &graph, WT const *weights, WT *resul } template -void overlap_list(GraphCSRView const &graph, - WT const *weights, +void overlap_list(legacy::GraphCSRView const& graph, + WT const* weights, ET num_pairs, - VT const *first, - VT const *second, - WT *result) + VT const* first, + VT const* second, + WT* result) { - CUGRAPH_EXPECTS(result != nullptr, "Invalid API parameter: result pointer is NULL"); - CUGRAPH_EXPECTS(first != nullptr, "Invalid API parameter: first column is NULL"); - CUGRAPH_EXPECTS(second != nullptr, "Invalid API parameter: second column is NULL"); + CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); + CUGRAPH_EXPECTS(first != nullptr, "Invalid input argument: first column is NULL"); + CUGRAPH_EXPECTS(second != nullptr, "Invalid input argument: second column is NULL"); rmm::device_vector weight_i(num_pairs); rmm::device_vector weight_s(num_pairs); @@ -388,41 +388,43 @@ void overlap_list(GraphCSRView const &graph, } } -template void overlap(GraphCSRView const &, - float const *, - float *); -template void overlap(GraphCSRView const &, - double const *, - double *); -template void overlap(GraphCSRView const &, - float const *, - float *); -template void overlap(GraphCSRView const &, - double const *, - double *); -template void overlap_list(GraphCSRView const &, - float const *, - int32_t, - int32_t const *, - int32_t const *, - float *); -template void overlap_list(GraphCSRView const &, - double const *, - int32_t, - int32_t const *, - int32_t const *, - double *); -template void overlap_list(GraphCSRView const &, - float const *, - int64_t, - int64_t const *, - int64_t const *, - float *); -template void overlap_list(GraphCSRView const &, - double const *, - int64_t, - int64_t const *, - int64_t const *, - double *); +template void overlap(legacy::GraphCSRView const&, + float const*, + float*); +template void overlap( + legacy::GraphCSRView const&, double const*, double*); +template void overlap(legacy::GraphCSRView const&, + float const*, + float*); +template void overlap( + legacy::GraphCSRView const&, double const*, double*); +template void overlap_list( + legacy::GraphCSRView const&, + float const*, + int32_t, + int32_t const*, + int32_t const*, + float*); +template void overlap_list( + legacy::GraphCSRView const&, + double const*, + int32_t, + int32_t const*, + int32_t const*, + double*); +template void overlap_list( + legacy::GraphCSRView const&, + float const*, + int64_t, + int64_t const*, + int64_t const*, + float*); +template void overlap_list( + legacy::GraphCSRView const&, + double const*, + int64_t, + int64_t const*, + int64_t const*, + double*); } // namespace cugraph
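With the views moved under `legacy::`, only the qualification changes at call sites. A minimal call-site sketch (hypothetical device pointers and sizes; the header setup and the exact view constructor signature are assumptions, not part of this diff):

  // d_offsets/d_indices/d_weights: device CSR arrays already populated (assumed)
  cugraph::legacy::GraphCSRView<int32_t, int32_t, float> csr_view(
    d_offsets, d_indices, d_weights, num_vertices, num_edges);
  rmm::device_vector<float> d_result(num_edges);  // one coefficient per edge
  cugraph::jaccard(csr_view, d_weights, d_result.data().get());
  cugraph::overlap(csr_view, d_weights, d_result.data().get());

diff --git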
a/cpp/src/sampling/random_walks.cu b/cpp/src/sampling/random_walks.cu new file mode 100644 index 00000000000..1883535bf70 --- /dev/null +++ b/cpp/src/sampling/random_walks.cu @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Andrei Schaffer, aschaffer@nvidia.com +// +#include +#include "random_walks.cuh" + +namespace cugraph { +namespace experimental { +// template explicit instantiation directives (EIDir's): +// +// SG FP32{ +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + random_walks(raft::handle_t const& handle, + graph_view_t const& gview, + int32_t const* ptr_d_start, + int32_t num_paths, + int32_t max_depth, + bool use_padding); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + random_walks(raft::handle_t const& handle, + graph_view_t const& gview, + int32_t const* ptr_d_start, + int64_t num_paths, + int64_t max_depth, + bool use_padding); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + random_walks(raft::handle_t const& handle, + graph_view_t const& gview, + int64_t const* ptr_d_start, + int64_t num_paths, + int64_t max_depth, + bool use_padding); +//} +// +// SG FP64{ +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + random_walks(raft::handle_t const& handle, + graph_view_t const& gview, + int32_t const* ptr_d_start, + int32_t num_paths, + int32_t max_depth, + bool use_padding); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + random_walks(raft::handle_t const& handle, + graph_view_t const& gview, + int32_t const* ptr_d_start, + int64_t num_paths, + int64_t max_depth, + bool use_padding); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + random_walks(raft::handle_t const& handle, + graph_view_t const& gview, + int64_t const* ptr_d_start, + int64_t num_paths, + int64_t max_depth, + bool use_padding); +//} + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + convert_paths_to_coo(raft::handle_t const& handle, + int32_t coalesced_sz_v, + int32_t num_paths, + rmm::device_buffer&& d_coalesced_v, + rmm::device_buffer&& d_sizes); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + convert_paths_to_coo(raft::handle_t const& handle, + int64_t coalesced_sz_v, + int64_t num_paths, + rmm::device_buffer&& d_coalesced_v, + rmm::device_buffer&& d_sizes); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + convert_paths_to_coo(raft::handle_t const& handle, + int64_t coalesced_sz_v, + int64_t num_paths, + rmm::device_buffer&& d_coalesced_v, + rmm::device_buffer&& d_sizes); + +template std::tuple, + rmm::device_uvector, + rmm::device_uvector> +query_rw_sizes_offsets(raft::handle_t const& handle, int32_t num_paths, int32_t const* ptr_d_sizes); + +template std::tuple, + rmm::device_uvector, + rmm::device_uvector> +query_rw_sizes_offsets(raft::handle_t const& handle, int64_t num_paths, int64_t const* ptr_d_sizes); + +} // namespace 
experimental +} // namespace cugraph diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh new file mode 100644 index 00000000000..2286fa28697 --- /dev/null +++ b/cpp/src/sampling/random_walks.cuh @@ -0,0 +1,1251 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Andrei Schaffer, aschaffer@nvidia.com +// +#pragma once + +#include +#include + +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include // FIXME: requirement for temporary std::getenv() +#include +#include +#include +#include + +#include "rw_traversals.hpp" + +namespace cugraph { +namespace experimental { + +namespace detail { + +// raft random generator: +// (using upper-bound cached "map" +// giving out_deg(v) for each v in [0, |V|); +// and a pre-generated vector of float random values +// in [0,1] to be brought into [0, d_ub[v])) +// +template +struct rrandom_gen_t { + using seed_type = seed_t; + using real_type = real_t; + + // cnstr. version that provides step-wise in-place + // rnd generation: + // + rrandom_gen_t(raft::handle_t const& handle, + index_t num_paths, + device_vec_t& d_random, // scratch-pad, non-coalesced + device_vec_t const& d_crt_out_deg, // non-coalesced + seed_t seed = seed_t{}) + : handle_(handle), + seed_(seed), + num_paths_(num_paths), + d_ptr_out_degs_(raw_const_ptr(d_crt_out_deg)), + d_ptr_random_(raw_ptr(d_random)) + { + auto rnd_sz = d_random.size(); + + CUGRAPH_EXPECTS(rnd_sz >= static_cast(num_paths), + "Un-allocated random buffer."); + + // done in constructor; + // this must be done at each step, + // but this object is constructed at each step; + // + generate_random(handle, d_ptr_random_, num_paths, seed_); + } + + // cnstr. version for the case when the + // random vector is provided by the caller: + // + rrandom_gen_t(raft::handle_t const& handle, + index_t num_paths, + real_t* ptr_d_rnd, // supplied + device_vec_t const& d_crt_out_deg, // non-coalesced + seed_t seed = seed_t{}) + : handle_(handle), + seed_(seed), + num_paths_(num_paths), + d_ptr_out_degs_(raw_const_ptr(d_crt_out_deg)), + d_ptr_random_(ptr_d_rnd) + { + } + + // in place: + // for each v in [0, num_paths) { + // if out_deg(v) > 0 + // d_col_indx[v] = random index in [0, out_deg(v)) + //} + void generate_col_indices(device_vec_t& d_col_indx) const + { + thrust::transform_if( + rmm::exec_policy(handle_.get_stream_view()), + d_ptr_random_, + d_ptr_random_ + num_paths_, // input1 + d_ptr_out_degs_, // input2 + d_ptr_out_degs_, // also stencil + d_col_indx.begin(), + [] __device__(real_t rnd_vindx, edge_t crt_out_deg) { + real_t max_ub = static_cast(crt_out_deg - 1); + auto interp_vindx = rnd_vindx * max_ub + real_t{.5}; + vertex_t v_indx = static_cast(interp_vindx); + return (v_indx >= crt_out_deg ? 
crt_out_deg - 1 : v_indx); + }, + [] __device__(auto crt_out_deg) { return crt_out_deg > 0; }); + } + + // abstracts away the random values generation: + // + static void generate_random(raft::handle_t const& handle, real_t* p_d_rnd, size_t sz, seed_t seed) + { + cugraph::detail::uniform_random_fill( + handle.get_stream_view(), p_d_rnd, sz, real_t{0.0}, real_t{1.0}, seed); + } + + private: + raft::handle_t const& handle_; + index_t num_paths_; + edge_t const* d_ptr_out_degs_; // device buffer with out-deg of current set of vertices (most + // recent vertex in each path); size = num_paths_ + real_t* d_ptr_random_; // device buffer with real random values; size = num_paths_ + seed_t seed_; // seed to be used for current batch +}; + +// seeding policy: time (clock) dependent, +// to avoid RW calls repeating same random data: +// +template +struct clock_seeding_t { + clock_seeding_t(void) = default; + + seed_t operator()(void) { return static_cast(std::time(nullptr)); } +}; + +// seeding policy: fixed for debug/testing repro +// +template +struct fixed_seeding_t { + // purposely no default cnstr. + + fixed_seeding_t(seed_t seed) : seed_(seed) {} + seed_t operator()(void) { return seed_; } + + private: + seed_t seed_; +}; + +// classes abstracting the next vertex extraction mechanism: +// +// primary template, purposely undefined +template +struct col_indx_extract_t; + +// specialization for single-gpu functionality: +// +template +struct col_indx_extract_t> { + using vertex_t = typename graph_t::vertex_type; + using edge_t = typename graph_t::edge_type; + using weight_t = typename graph_t::weight_type; + + col_indx_extract_t(raft::handle_t const& handle, + graph_t const& graph, + edge_t const* p_d_crt_out_degs, + index_t const* p_d_sizes, + index_t num_paths, + index_t max_depth) + : handle_(handle), + col_indices_(graph.get_matrix_partition_view().get_indices()), + row_offsets_(graph.get_matrix_partition_view().get_offsets()), + values_(graph.get_matrix_partition_view().get_weights()), + out_degs_(p_d_crt_out_degs), + sizes_(p_d_sizes), + num_paths_(num_paths), + max_depth_(max_depth) + { + } + + // in-place extractor of next set of vertices and weights, + // (d_v_next_vertices, d_v_next_weights), + // given start set of vertices. 
d_v_src_vertices, + // and corresponding column index set, d_v_col_indx: + // + // for each indx in [0, num_paths){ + // v_indx = d_v_src_vertices[indx*max_depth + d_sizes[indx] - 1]; + // if( out_degs_[v_indx] > 0 ) { + // start_row = row_offsets_[v_indx]; + // delta = d_v_col_indx[indx]; + // d_v_next_vertices[indx] = col_indices_[start_row + delta]; + // } + // (use transform_if() with transform iterator) + // + void operator()( + device_vec_t const& d_coalesced_src_v, // in: coalesced vector of vertices + device_vec_t const& + d_v_col_indx, // in: column indices, given by stepper's random engine + device_vec_t& d_v_next_vertices, // out: set of destination vertices, for next step + device_vec_t& + d_v_next_weights) // out: set of weights between src and destination vertices, for next step + const + { + thrust::transform_if( + rmm::exec_policy(handle_.get_stream_view()), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_paths_), // input1 + d_v_col_indx.begin(), // input2 + out_degs_, // stencil + thrust::make_zip_iterator( + thrust::make_tuple(d_v_next_vertices.begin(), d_v_next_weights.begin())), // output + [max_depth = max_depth_, + ptr_d_sizes = sizes_, + ptr_d_coalesced_v = raw_const_ptr(d_coalesced_src_v), + row_offsets = row_offsets_, + col_indices = col_indices_, + values = values_ ? thrust::optional{*values_} + : thrust::nullopt] __device__(auto indx, auto col_indx) { + auto delta = ptr_d_sizes[indx] - 1; + auto v_indx = ptr_d_coalesced_v[indx * max_depth + delta]; + auto start_row = row_offsets[v_indx]; + + auto weight_value = (values ? (*values)[start_row + col_indx] + : weight_t{1}); // account for un-weighted graphs + return thrust::make_tuple(col_indices[start_row + col_indx], weight_value); + }, + [] __device__(auto crt_out_deg) { return crt_out_deg > 0; }); + } + + private: + raft::handle_t const& handle_; + vertex_t const* col_indices_; + edge_t const* row_offsets_; + std::optional values_; + + edge_t const* out_degs_; + index_t const* sizes_; + index_t num_paths_; + index_t max_depth_; +}; + +/** + * @brief Class abstracting the RW initialization, stepping, and stopping functionality + * The outline of the algorithm is as follows: + * + * (1) vertex sets are coalesced into d_coalesced_v, + * weight sets are coalesced into d_coalesced_w; + * i.e., the 2 coalesced vectors are allocated to + * num_paths * max_depth, and num_paths * (max_depth -1), respectively + * (since each path has a number of edges equal to one + * less than the number of vertices); + * d_coalesced_v is initialized for each i*max_depth entry + * (i=0,...,num_paths-1) to the corresponding starting vertices; + * (2) d_sizes maintains the current size for each path; + * Note that a path may end prematurely if it reaches a sink vertex; + * (3) d_crt_out_degs maintains the out-degree of each of the latest + * vertices in the path; i.e., if N(v) := set of destination + * vertices from v, then this vector stores |N(v)| + * for last v in each path; i.e., + * d_crt_out_degs[i] = + * out-degree( d_coalesced_v[i*max_depth + d_sizes[i]-1] ), + * for i in {0,..., num_paths-1}; + * (4) a set of num_paths floating point numbers between [0,1] + * are generated at each step; then they get translated into + * _indices_ k in {0,...d_crt_out_degs[i]-1}; + * (5) the next vertex v is then picked as the k-th out-neighbor: + * next(v) = N(v)[k]; + * (6) d_sizes are incremented accordingly; i.e., for those paths whose + * corresponding last vertex has out-degree > 0; + * (7) then next(v) and
corresponding weight of (v, next(v)) are stored + * at appropriate location in their corresponding coalesced vectors; + * (8) the client of this class (the random_walks() function) then repeats + * this process max_depth times or until all paths + * have reached sinks; i.e., d_crt_out_degs = {0, 0,...,0}, + * whichever comes first; + * (9) in the end some post-processing is done (stop()) to remove + * unused entries from the 2 coalesced vectors; + * (10) the triplet made of the 2 coalesced vectors and d_sizes is then returned; + * + */ +template , + typename index_t = typename graph_t::edge_type> +struct random_walker_t { + using vertex_t = typename graph_t::vertex_type; + using edge_t = typename graph_t::edge_type; + using weight_t = typename graph_t::weight_type; + using seed_t = typename random_engine_t::seed_type; + using real_t = typename random_engine_t::real_type; + using rnd_engine_t = random_engine_t; + + random_walker_t(raft::handle_t const& handle, + graph_t const& graph, + index_t num_paths, + index_t max_depth, + vertex_t v_padding_val = 0, + weight_t w_padding_val = 0) + : handle_(handle), + num_paths_(num_paths), + max_depth_(max_depth), + d_cached_out_degs_(graph.compute_out_degrees(handle_)), + vertex_padding_value_(v_padding_val != 0 ? v_padding_val : graph.get_number_of_vertices()), + weight_padding_value_(w_padding_val) + { + } + + // for each i in [0..num_paths_) { + // d_paths_v_set[i*max_depth] = d_src_init_v[i]; + // + void start(device_const_vector_view& d_src_init_v, // in: start set + device_vec_t& d_paths_v_set, // out: coalesced v + device_vec_t& d_sizes) const // out: init sizes to {1,...} + { + // initialize path sizes to 1, as they contain at least one vertex each: + // the initial set: d_src_init_v; + // + thrust::copy_n(rmm::exec_policy(handle_.get_stream_view()), + thrust::make_constant_iterator(1), + num_paths_, + d_sizes.begin()); + + // scatter d_src_init_v to coalesced vertex vector: + // + auto dlambda = [stride = max_depth_] __device__(auto indx) { return indx * stride; }; + + // use the transform iterator as map: + // + auto map_it_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), dlambda); + + thrust::scatter(rmm::exec_policy(handle_.get_stream_view()), + d_src_init_v.begin(), + d_src_init_v.end(), + map_it_begin, + d_paths_v_set.begin()); + } + + // overload for start() with device_uvector d_v_start + // (handy for testing) + // + void start(device_vec_t const& d_start, // in: start set + device_vec_t& d_paths_v_set, // out: coalesced v + device_vec_t& d_sizes) const // out: init sizes to {1,...} + { + device_const_vector_view d_start_cview{d_start.data(), + static_cast(d_start.size())}; + + start(d_start_cview, d_paths_v_set, d_sizes); + }
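The coalesced layout that start() establishes is easiest to see on a tiny worked example (illustrative values, not from this patch): num_paths = 2, max_depth = 4, start vertices {v0, v1}.

  // hypothetical post-start() state:
  //   d_coalesced_v = [ v0, -, -, -,  v1, -, -, - ]  // path i owns slots [i*max_depth, (i+1)*max_depth)
  //   d_sizes       = [ 1, 1 ]                       // each path holds only its start vertex so far
  // each subsequent step() writes slot i*max_depth + d_sizes[i] and increments d_sizes[i],
  // but only while the path's last vertex still has out-degree > 0.

+ + // in-place updates its arguments from one step to next + // (to avoid copying); all "crt" arguments are updated at each step() + // and passed as scratchpad space to avoid copying them + // from one step to another + // + // take one step in sync for all paths that have not reached sinks: + // + void step( + graph_t const& graph, + seed_t seed, + device_vec_t& d_coalesced_v, // crt coalesced vertex set + device_vec_t& d_coalesced_w, // crt coalesced weight set + device_vec_t& d_paths_sz, // crt paths sizes + device_vec_t& d_crt_out_degs, // crt out-degs for current set of vertices + device_vec_t& d_random, // crt set of random real values + device_vec_t& d_col_indx, // crt col indices to be used for retrieving next step + device_vec_t& d_next_v, // crt set of destination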
vertices, for next step + device_vec_t& d_next_w) + const // set of weights between src and destination vertices, for next step + { + // update crt snapshot of out-degs, + // from cached out degs, using + // latest vertex in each path as source: + // + gather_from_coalesced( + d_coalesced_v, d_cached_out_degs_, d_paths_sz, d_crt_out_degs, max_depth_, num_paths_); + + // generate random destination indices: + // + random_engine_t rgen(handle_, num_paths_, d_random, d_crt_out_degs, seed); + + rgen.generate_col_indices(d_col_indx); + + // dst extraction from dst indices: + // + col_indx_extract_t col_extractor(handle_, + graph, + raw_const_ptr(d_crt_out_degs), + raw_const_ptr(d_paths_sz), + num_paths_, + max_depth_); + + // The following steps update the next entry in each path, + // except the paths that reached sinks; + // + // for each indx in [0..num_paths) { + // v_indx = d_v_rnd_n_indx[indx]; + // + // -- get the `v_indx`-th out-vertex of d_v_paths_v_set[indx] vertex: + // -- also, note the size deltas increased by 1 in dst (d_sizes[]): + // + // d_coalesced_v[indx*num_paths + d_sizes[indx]] = + // get_out_vertex(graph, d_coalesced_v[indx*num_paths + d_sizes[indx] -1)], v_indx); + // d_coalesced_w[indx*(num_paths-1) + d_sizes[indx] - 1] = + // get_out_edge_weight(graph, d_coalesced_v[indx*num_paths + d_sizes[indx]-2], v_indx); + // + // (1) generate actual vertex destinations: + // + col_extractor(d_coalesced_v, d_col_indx, d_next_v, d_next_w); + + // (2) update path sizes: + // + update_path_sizes(d_crt_out_degs, d_paths_sz); + + // (3) actual coalesced updates: + // + scatter_vertices(d_next_v, d_coalesced_v, d_crt_out_degs, d_paths_sz); + scatter_weights(d_next_w, d_coalesced_w, d_crt_out_degs, d_paths_sz); + } + + // step() version that doesn't update the random vector: + // (the caller supplies it) + // + void step_only( + graph_t const& graph, + device_vec_t& d_coalesced_v, // crt coalesced vertex set + device_vec_t& d_coalesced_w, // crt coalesced weight set + device_vec_t& d_paths_sz, // crt paths sizes + device_vec_t& d_crt_out_degs, // crt out-degs for current set of vertices + real_t* ptr_d_random, // crt set of random real values (supplied) + device_vec_t& d_col_indx, // crt col indices to be used for retrieving next step + device_vec_t& d_next_v, // crt set of destination vertices, for next step + device_vec_t& d_next_w) + const // set of weights between src and destination vertices, for next step + { + // update crt snapshot of out-degs, + // from cached out degs, using + // latest vertex in each path as source: + // + gather_from_coalesced( + d_coalesced_v, d_cached_out_degs_, d_paths_sz, d_crt_out_degs, max_depth_, num_paths_); + + // generate random destination indices: + // + random_engine_t rgen(handle_, num_paths_, ptr_d_random, d_crt_out_degs); + + rgen.generate_col_indices(d_col_indx); + + // dst extraction from dst indices: + // + col_indx_extract_t col_extractor(handle_, + graph, + raw_const_ptr(d_crt_out_degs), + raw_const_ptr(d_paths_sz), + num_paths_, + max_depth_); + + // The following steps update the next entry in each path, + // except the paths that reached sinks; + // + // for each indx in [0..num_paths) { + // v_indx = d_v_rnd_n_indx[indx]; + // + // -- get the `v_indx`-th out-vertex of d_v_paths_v_set[indx] vertex: + // -- also, note the size deltas increased by 1 in dst (d_sizes[]): + // + // d_coalesced_v[indx*num_paths + d_sizes[indx]] = + // get_out_vertex(graph, d_coalesced_v[indx*num_paths + d_sizes[indx] -1)], v_indx); + //
d_coalesced_w[indx*(num_paths-1) + d_sizes[indx] - 1] = + // get_out_edge_weight(graph, d_coalesced_v[indx*num_paths + d_sizes[indx]-2], v_indx); + // + // (1) generate actual vertex destinations: + // + col_extractor(d_coalesced_v, d_col_indx, d_next_v, d_next_w); + + // (2) update path sizes: + // + update_path_sizes(d_crt_out_degs, d_paths_sz); + + // (3) actual coalesced updates: + // + scatter_vertices(d_next_v, d_coalesced_v, d_crt_out_degs, d_paths_sz); + scatter_weights(d_next_w, d_coalesced_w, d_crt_out_degs, d_paths_sz); + } + + // returns true if all paths reached sinks: + // + bool all_paths_stopped(device_vec_t const& d_crt_out_degs) const + { + auto how_many_stopped = + thrust::count_if(rmm::exec_policy(handle_.get_stream_view()), + d_crt_out_degs.begin(), + d_crt_out_degs.end(), + [] __device__(auto crt_out_deg) { return crt_out_deg == 0; }); + return (static_cast(how_many_stopped) == d_crt_out_degs.size()); + } + + // wrap-up, post-process: + // truncate v_set, w_set to actual space used + // + void stop(device_vec_t& d_coalesced_v, // coalesced vertex set + device_vec_t& d_coalesced_w, // coalesced weight set + device_vec_t const& d_sizes) const // paths sizes + { + assert(max_depth_ > 1); // else, no need to step; and no edges + + index_t const* ptr_d_sizes = d_sizes.data(); + + auto predicate_v = [max_depth = max_depth_, ptr_d_sizes] __device__(auto indx) { + auto row_indx = indx / max_depth; + auto col_indx = indx % max_depth; + + return (col_indx >= ptr_d_sizes[row_indx]); + }; + + auto predicate_w = [max_depth = max_depth_, ptr_d_sizes] __device__(auto indx) { + auto row_indx = indx / (max_depth - 1); + auto col_indx = indx % (max_depth - 1); + + return (col_indx >= ptr_d_sizes[row_indx] - 1); + }; + + auto new_end_v = thrust::remove_if(rmm::exec_policy(handle_.get_stream_view()), + d_coalesced_v.begin(), + d_coalesced_v.end(), + thrust::make_counting_iterator(0), + predicate_v); + + auto new_end_w = thrust::remove_if(rmm::exec_policy(handle_.get_stream_view()), + d_coalesced_w.begin(), + d_coalesced_w.end(), + thrust::make_counting_iterator(0), + predicate_w); + + handle_.get_stream_view().synchronize(); + + d_coalesced_v.resize(thrust::distance(d_coalesced_v.begin(), new_end_v), handle_.get_stream()); + d_coalesced_w.resize(thrust::distance(d_coalesced_w.begin(), new_end_w), handle_.get_stream()); + } + + // in-place non-static (needs handle_): + // for indx in [0, nelems): + // gather d_result[indx] = d_src[d_coalesced[indx*stride + d_sizes[indx] -1]] + // + template + void gather_from_coalesced( + device_vec_t const& d_coalesced, // |gather map| = stride*nelems + device_vec_t const& d_src, // |gather input| = nelems + device_vec_t const& d_sizes, // |paths sizes| = nelems, elems in [1, stride] + device_vec_t& d_result, // |output| = nelems + index_t stride, // stride = coalesce block size (typically max_depth) + index_t nelems) const // nelems = number of elements to gather (typically num_paths_) + { + vertex_t const* ptr_d_coalesced = raw_const_ptr(d_coalesced); + index_t const* ptr_d_sizes = raw_const_ptr(d_sizes); + + // delta = ptr_d_sizes[indx] - 1 + // + auto dlambda = [stride, ptr_d_sizes, ptr_d_coalesced] __device__(auto indx) { + auto delta = ptr_d_sizes[indx] - 1; + return ptr_d_coalesced[indx * stride + delta]; + }; + + // use the transform iterator as map: + // + auto map_it_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), dlambda); + + thrust::gather(rmm::exec_policy(handle_.get_stream_view()), + map_it_begin, + 
map_it_begin + nelems, + d_src.begin(), + d_result.begin()); + } + + // in-place non-static (needs handle_); + // pre-condition: path sizes are assumed updated + // to reflect new vertex additions; + // + // for indx in [0, nelems): + // if ( d_crt_out_degs[indx] > 0 ) + // d_coalesced[indx*stride + (d_sizes[indx] - adjust)- 1] = d_src[indx] + // + // adjust := 0 for coalesced vertices; 1 for weights + // (because |edges| = |vertices| - 1, in each path); + // + template + void scatter_to_coalesced( + device_vec_t const& d_src, // |scatter input| = nelems + device_vec_t& d_coalesced, // |scatter input| = stride*nelems + device_vec_t const& d_crt_out_degs, // |current set of vertex out degrees| = nelems, + // to be used as stencil (don't scatter if 0) + device_vec_t const& + d_sizes, // paths sizes used to provide delta in coalesced paths; + // pre-condition: assumed as updated to reflect new vertex additions; + // also, this is the number of _vertices_ in each path; + // hence for scattering weights this needs to be adjusted; hence the `adjust` parameter + index_t + stride, // stride = coalesce block size (max_depth for vertices; max_depth-1 for weights) + index_t nelems, // nelems = number of elements to gather (typically num_paths_) + index_t adjust = 0) + const // adjusting parameter for scattering vertices (0) or weights (1); see above for more; + { + index_t const* ptr_d_sizes = raw_const_ptr(d_sizes); + + auto dlambda = [stride, adjust, ptr_d_sizes] __device__(auto indx) { + auto delta = ptr_d_sizes[indx] - adjust - 1; + return indx * stride + delta; + }; + + // use the transform iterator as map: + // + auto map_it_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), dlambda); + + thrust::scatter_if(rmm::exec_policy(handle_.get_stream_view()), + d_src.begin(), + d_src.end(), + map_it_begin, + d_crt_out_degs.begin(), + d_coalesced.begin(), + [] __device__(auto crt_out_deg) { + return crt_out_deg > 0; // predicate + }); + } + + // updates the entries in the corresponding coalesced vector, + // for which out_deg > 0 + // + void scatter_vertices(device_vec_t const& d_src, + device_vec_t& d_coalesced, + device_vec_t const& d_crt_out_degs, + device_vec_t const& d_sizes) const + { + scatter_to_coalesced(d_src, d_coalesced, d_crt_out_degs, d_sizes, max_depth_, num_paths_); + } + // + void scatter_weights(device_vec_t const& d_src, + device_vec_t& d_coalesced, + device_vec_t const& d_crt_out_degs, + device_vec_t const& d_sizes) const + { + scatter_to_coalesced( + d_src, d_coalesced, d_crt_out_degs, d_sizes, max_depth_ - 1, num_paths_, 1); + } + + // in-place update (increment) path sizes for paths + // that have not reached a sink; i.e., for which + // d_crt_out_degs[indx]>0: + // + void update_path_sizes(device_vec_t const& d_crt_out_degs, + device_vec_t& d_sizes) const + { + thrust::transform_if( + rmm::exec_policy(handle_.get_stream_view()), + d_sizes.begin(), + d_sizes.end(), // input + d_crt_out_degs.begin(), // stencil + d_sizes.begin(), // output: in-place + [] __device__(auto crt_sz) { return crt_sz + 1; }, + [] __device__(auto crt_out_deg) { return crt_out_deg > 0; }); + } + + device_vec_t const& get_out_degs(void) const { return d_cached_out_degs_; } + + vertex_t get_vertex_padding_value(void) const { return vertex_padding_value_; } + + weight_t get_weight_padding_value(void) const { return weight_padding_value_; } + + void init_padding(device_vec_t& d_coalesced_v, + device_vec_t& d_coalesced_w) const + { + 
thrust::fill(rmm::exec_policy(handle_.get_stream_view()), + d_coalesced_v.begin(), + d_coalesced_v.end(), + vertex_padding_value_); + + thrust::fill(rmm::exec_policy(handle_.get_stream_view()), + d_coalesced_w.begin(), + d_coalesced_w.end(), + weight_padding_value_); + } + + decltype(auto) get_handle(void) const { return handle_; } + + private: + raft::handle_t const& handle_; + index_t num_paths_; + index_t max_depth_; + device_vec_t d_cached_out_degs_; + vertex_t const vertex_padding_value_; + weight_t const weight_padding_value_; +}; + +/** + * @brief returns random walks (RW) from starting sources, where each path is of given maximum + * length. Single-GPU specialization. + * + * @tparam graph_t Type of graph (view). + * @tparam traversal_t Traversal policy. Either horizontal (faster but requires more memory) or + * vertical. Defaults to horizontal. + * @tparam random_engine_t Type of random engine used to generate RW. + * @tparam seeding_policy_t Random engine seeding policy: variable or fixed (for reproducibility). + * Defaults to variable, clock dependent. + * @tparam index_t Indexing type. Defaults to edge_type. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph Graph object to generate RW on. + * @param d_v_start Device (view) set of starting vertex indices for the RW. + * number(paths) == d_v_start.size(). + * @param max_depth maximum length of RWs. + * @param use_padding (optional) specifies if return uses padded format (true), or coalesced + * (compressed) format; when padding is used the output is a matrix of vertex paths and a matrix of + * edges paths (weights); in this case the matrices are stored in row major order; the vertex path + * matrix is padded with `num_vertices` values and the weight matrix is padded with `0` values; + * @param seeder (optional) is object providing the random seeding mechanism. Defaults to local + * clock time as initial seed. + * @return std::tuple, device_vec_t, + * device_vec_t> Triplet of either padded or coalesced RW paths; in the coalesced case + * (default), the return consists of corresponding vertex and edge weights for each, and + * corresponding path sizes. This is meant to minimize the number of DF's to be passed to the Python + * layer. The meaning of "coalesced" here is that a 2D array of paths of different sizes is + * represented as a 1D contiguous array. In the padded case the return is a matrix of num_paths x + * max_depth vertex paths; and num_paths x (max_depth-1) edge (weight) paths, with an empty array of + * sizes. 
Note: if the graph is un-weighted the edge (weight) paths consist of `weight_t{1}` + * entries; + */ +template , + typename seeding_policy_t = clock_seeding_t, + typename index_t = typename graph_t::edge_type> +std::enable_if_t, + device_vec_t, + device_vec_t, + typename random_engine_t::seed_type>> +random_walks_impl(raft::handle_t const& handle, + graph_t const& graph, + device_const_vector_view& d_v_start, + index_t max_depth, + bool use_padding = false, + seeding_policy_t seeder = clock_seeding_t{}) +{ + using vertex_t = typename graph_t::vertex_type; + using edge_t = typename graph_t::edge_type; + using weight_t = typename graph_t::weight_type; + using seed_t = typename random_engine_t::seed_type; + using real_t = typename random_engine_t::real_type; + + vertex_t num_vertices = graph.get_number_of_vertices(); + + auto how_many_valid = thrust::count_if(rmm::exec_policy(handle.get_stream_view()), + d_v_start.begin(), + d_v_start.end(), + [num_vertices] __device__(auto crt_vertex) { + return (crt_vertex >= 0) && (crt_vertex < num_vertices); + }); + + CUGRAPH_EXPECTS(static_cast(how_many_valid) == d_v_start.size(), + "Invalid set of starting vertices."); + + auto num_paths = d_v_start.size(); + auto stream = handle.get_stream(); + + random_walker_t rand_walker{ + handle, graph, static_cast(num_paths), static_cast(max_depth)}; + + // pre-allocate num_paths * max_depth; + // + auto coalesced_sz = num_paths * max_depth; + device_vec_t d_coalesced_v(coalesced_sz, stream); // coalesced vertex set + device_vec_t d_coalesced_w(coalesced_sz, stream); // coalesced weight set + device_vec_t d_paths_sz(num_paths, stream); // paths sizes + + // traversal policy: + // + traversal_t traversor(num_paths, max_depth); + + auto tmp_buff_sz = traversor.get_tmp_buff_sz(); + + device_vec_t d_crt_out_degs(tmp_buff_sz, stream); // crt vertex set out-degs + device_vec_t d_col_indx(tmp_buff_sz, stream); // \in {0,..,out-deg(v)} + device_vec_t d_next_v(tmp_buff_sz, stream); // crt set of next vertices + device_vec_t d_next_w(tmp_buff_sz, stream); // crt set of next weights + + // random data handling: + // + auto rnd_data_sz = traversor.get_random_buff_sz(); + device_vec_t d_random(rnd_data_sz, stream); + // abstracted out seed initialization: + // + seed_t seed0 = static_cast(seeder()); + + // if padding used, initialize padding values: + // + if (use_padding) rand_walker.init_padding(d_coalesced_v, d_coalesced_w); + + // very first vertex, for each path: + // + rand_walker.start(d_v_start, d_coalesced_v, d_paths_sz); + + // traverse paths: + // + traversor(graph, + rand_walker, + seed0, + d_coalesced_v, + d_coalesced_w, + d_paths_sz, + d_crt_out_degs, + d_random, + d_col_indx, + d_next_v, + d_next_w); + + // wrap-up, post-process: + // truncate v_set, w_set to actual space used + // unless padding is used + // + if (!use_padding) { rand_walker.stop(d_coalesced_v, d_coalesced_w, d_paths_sz); } + + // because device_uvector is not copy-cnstr-able: + // + if (!use_padding) { + return std::make_tuple(std::move(d_coalesced_v), + std::move(d_coalesced_w), + std::move(d_paths_sz), + seed0); // also return seed for repro + } else { + return std::make_tuple( + std::move(d_coalesced_v), + std::move(d_coalesced_w), + device_vec_t(0, stream), // purposely empty size array for the padded case, to avoid + // unnecessary allocations + seed0); // also return seed for repro + } +}
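The two return layouts of random_walks_impl() can be compared on a small example (hypothetical paths {2,5,7} and {4,9}, max_depth = 3, num_vertices = 10; w_uv denotes the weight of edge (u,v); values are made up for this sketch):

  // coalesced (use_padding == false):
  //   vertices = [2,5,7, 4,9]      weights = [w_25,w_57, w_49]     sizes = [3,2]
  // padded (use_padding == true):
  //   vertices = [2,5,7, 4,9,10]   weights = [w_25,w_57, w_49,0]   sizes = []  // 10 == num_vertices pad

+ +/** + * @brief returns random walks (RW) from starting sources, where each path is of given maximum + * length. Multi-GPU specialization.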
+ * + * @tparam graph_t Type of graph (view). + * @tparam traversal_t Traversal policy. Either horizontal (faster but requires more memory) or + * vertical. Defaults to horizontal. + * @tparam random_engine_t Type of random engine used to generate RW. + * @tparam seeding_policy_t Random engine seeding policy: variable or fixed (for reproducibility). + * Defaults to variable, clock dependent. + * @tparam index_t Indexing type. Defaults to edge_type. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph Graph object to generate RW on. + * @param d_v_start Device (view) set of starting vertex indices for the RW. number(RW) == + * d_v_start.size(). + * @param max_depth maximum length of RWs. + * @param use_padding (optional) specifies if return uses padded format (true), or coalesced + * (compressed) format; when padding is used the output is a matrix of vertex paths and a matrix of + * edges paths (weights); in this case the matrices are stored in row major order; the vertex path + * matrix is padded with `num_vertices` values and the weight matrix is padded with `0` values; + * @param seeder (optional) is object providing the random seeding mechanism. Defaults to local + * clock time as initial seed. + * @return std::tuple, device_vec_t, + * device_vec_t> Triplet of either padded or coalesced RW paths; in the coalesced case + * (default), the return consists of corresponding vertex and edge weights for each, and + * corresponding path sizes. This is meant to minimize the number of DF's to be passed to the Python + * layer. The meaning of "coalesced" here is that a 2D array of paths of different sizes is + * represented as a 1D contiguous array. In the padded case the return is a matrix of num_paths x + * max_depth vertex paths; and num_paths x (max_depth-1) edge (weight) paths, with an empty array of + * sizes. 
Note: if the graph is un-weighted the edge (weight) paths consist of `weight_t{1}` + * entries; + */ +template , + typename seeding_policy_t = clock_seeding_t, + typename index_t = typename graph_t::edge_type> +std::enable_if_t, + device_vec_t, + device_vec_t, + typename random_engine_t::seed_type>> +random_walks_impl(raft::handle_t const& handle, + graph_t const& graph, + device_const_vector_view& d_v_start, + index_t max_depth, + bool use_padding = false, + seeding_policy_t seeder = clock_seeding_t{}) +{ + CUGRAPH_FAIL("Not implemented yet."); +} + +// provides conversion of (coalesced) paths to COO format: +// (which in turn provides an API consistent with egonet) +// +template +struct coo_convertor_t { + coo_convertor_t(raft::handle_t const& handle, index_t num_paths) + : handle_(handle), num_paths_(num_paths) + { + } + + std::tuple, device_vec_t, device_vec_t> operator()( + device_const_vector_view& d_coalesced_v, + device_const_vector_view& d_sizes) const + { + CUGRAPH_EXPECTS(static_cast(d_sizes.size()) == num_paths_, "Invalid size vector."); + + auto tupl_fill = fill_stencil(d_sizes); + auto&& d_stencil = std::move(std::get<0>(tupl_fill)); + auto total_sz_v = std::get<1>(tupl_fill); + auto&& d_sz_incl_scan = std::move(std::get<2>(tupl_fill)); + + CUGRAPH_EXPECTS(static_cast(d_coalesced_v.size()) == total_sz_v, + "Inconsistent vertex coalesced size data."); + + auto src_dst_tpl = gather_pairs(d_coalesced_v, d_stencil, total_sz_v); + + auto&& d_src = std::move(std::get<0>(src_dst_tpl)); + auto&& d_dst = std::move(std::get<1>(src_dst_tpl)); + + device_vec_t d_sz_w_scan(num_paths_, handle_.get_stream()); + + // copy vertex path sizes that are > 1: + // (because vertex_path_sz translates + // into edge_path_sz = vertex_path_sz - 1, + // and edge_paths_sz == 0 don't contribute + // anything): + // + auto new_end_it = thrust::copy_if(rmm::exec_policy(handle_.get_stream_view()), + d_sizes.begin(), + d_sizes.end(), + d_sz_w_scan.begin(), + [] __device__(auto sz_value) { return sz_value > 1; }); + + // resize to new_end: + // + d_sz_w_scan.resize(thrust::distance(d_sz_w_scan.begin(), new_end_it), handle_.get_stream()); + + // get paths' edge number exclusive scan + // by transforming paths' vertex numbers that + // are > 1, via transformation: + // edge_path_sz = (vertex_path_sz-1): + // + thrust::transform_exclusive_scan( + rmm::exec_policy(handle_.get_stream_view()), + d_sz_w_scan.begin(), + d_sz_w_scan.end(), + d_sz_w_scan.begin(), + [] __device__(auto sz) { return sz - 1; }, + index_t{0}, + thrust::plus{}); + + return std::make_tuple(std::move(d_src), std::move(d_dst), std::move(d_sz_w_scan)); + } + + std::tuple, index_t, device_vec_t> fill_stencil( + device_const_vector_view& d_sizes) const + { + device_vec_t d_scan(num_paths_, handle_.get_stream()); + thrust::inclusive_scan( + rmm::exec_policy(handle_.get_stream_view()), d_sizes.begin(), d_sizes.end(), d_scan.begin()); + + index_t total_sz{0}; + CUDA_TRY(cudaMemcpy( + &total_sz, raw_ptr(d_scan) + num_paths_ - 1, sizeof(index_t), cudaMemcpyDeviceToHost)); + + device_vec_t d_stencil(total_sz, handle_.get_stream()); + + // initialize stencil to all 1's: + // + thrust::copy_n(rmm::exec_policy(handle_.get_stream_view()), + thrust::make_constant_iterator(1), + d_stencil.size(), + d_stencil.begin()); + + // set to 0 entries positioned at inclusive_scan(sizes[]), + // because those are path "breakpoints", where a path ends + // and the next one starts, hence there cannot be an edge + // between a path ending vertex and next path starting
vertex; + // + thrust::scatter(rmm::exec_policy(handle_.get_stream_view()), + thrust::make_constant_iterator(0), + thrust::make_constant_iterator(0) + num_paths_, + d_scan.begin(), + d_stencil.begin()); + + return std::make_tuple(std::move(d_stencil), total_sz, std::move(d_scan)); + } + + std::tuple, device_vec_t> gather_pairs( + device_const_vector_view& d_coalesced_v, + device_vec_t const& d_stencil, + index_t total_sz_v) const + { + auto total_sz_w = total_sz_v - num_paths_; + device_vec_t valid_src_indx(total_sz_w, handle_.get_stream()); + + // generate valid vertex src indices, + // which is any index in {0,...,total_sz_v - 2} + // provided the next index position; i.e., (index+1), + // in stencil is not 0; (if it is, there's no "next" + // or dst index, because the path has ended); + // + thrust::copy_if(rmm::exec_policy(handle_.get_stream_view()), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(total_sz_v - 1), + valid_src_indx.begin(), + [ptr_d_stencil = raw_const_ptr(d_stencil)] __device__(auto indx) { + auto dst_indx = indx + 1; + return ptr_d_stencil[dst_indx] == 1; + }); + + device_vec_t d_src_v(total_sz_w, handle_.get_stream()); + device_vec_t d_dst_v(total_sz_w, handle_.get_stream()); + + // construct pair of src[], dst[] by gathering + // from d_coalesced_v all pairs + // at entries (valid_src_indx, valid_src_indx+1), + // where the set of valid_src_indx was + // generated at the previous step; + // + thrust::transform( + rmm::exec_policy(handle_.get_stream_view()), + valid_src_indx.begin(), + valid_src_indx.end(), + thrust::make_zip_iterator(thrust::make_tuple(d_src_v.begin(), d_dst_v.begin())), // start_zip + [ptr_d_vertex = raw_const_ptr(d_coalesced_v)] __device__(auto indx) { + return thrust::make_tuple(ptr_d_vertex[indx], ptr_d_vertex[indx + 1]); + }); + + return std::make_tuple(std::move(d_src_v), std::move(d_dst_v)); + } + + private: + raft::handle_t const& handle_; + index_t num_paths_; +}; + +} // namespace detail
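A small worked example pins down what coo_convertor_t produces (illustrative values only): coalesced paths {2,5,7} and {4,9}, i.e. d_coalesced_v = [2,5,7,4,9] and d_sizes = [3,2].

  // fill_stencil: inclusive_scan(sizes) = [3,5] -> stencil = [1,1,1,0,1]  (a 0 marks where a new path starts)
  // gather_pairs: valid src positions are the i with stencil[i+1] == 1, here {0,1,3}:
  //   src = [2,5,4], dst = [5,7,9]
  // edge counts per path: sizes - 1 = [2,1]; exclusive scan -> path_offsets = [0,2]
  //   (path 0 owns COO rows [0,2), path 1 owns [2,3))

+ +/** + * @brief returns random walks (RW) from starting sources, where each path is of given maximum + * length. Uniform distribution is assumed for the random engine. + * + * @tparam graph_t Type of graph/view (typically, graph_view_t). + * @tparam index_t Type used to store indexing and sizes. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph Graph (view) object to generate RW on. + * @param ptr_d_start Device pointer to set of starting vertex indices for the RW. + * @param num_paths = number(paths). + * @param max_depth maximum length of RWs. + * @param use_padding (optional) specifies if return uses padded format (true), or coalesced + * (compressed) format; when padding is used the output is a matrix of vertex paths and a matrix of + * edges paths (weights); in this case the matrices are stored in row major order; the vertex path + * matrix is padded with `num_vertices` values and the weight matrix is padded with `0` values; + * @return std::tuple, rmm::device_uvector, + * rmm::device_uvector> Triplet of either padded or coalesced RW paths; in the coalesced + * case (default), the return consists of corresponding vertex and edge weights for each, and + * corresponding path sizes. This is meant to minimize the number of DF's to be passed to the Python + * layer. The meaning of "coalesced" here is that a 2D array of paths of different sizes is + * represented as a 1D contiguous array.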
In the padded case the return is a matrix of num_paths x + * max_depth vertex paths; and num_paths x (max_depth-1) edge (weight) paths, with an empty array of + * sizes. Note: if the graph is un-weighted the edge (weight) paths consist of `weight_t{1}` + * entries; + */ +template +std::tuple, + rmm::device_uvector, + rmm::device_uvector> +random_walks(raft::handle_t const& handle, + graph_t const& graph, + typename graph_t::vertex_type const* ptr_d_start, + index_t num_paths, + index_t max_depth, + bool use_padding) +{ + using vertex_t = typename graph_t::vertex_type; + using edge_t = typename graph_t::edge_type; + using weight_t = typename graph_t::weight_type; + using rnd_engine_t = float; + + // 0-copy const device view: + // + detail::device_const_vector_view d_v_start{ptr_d_start, num_paths}; + + // GPU memory availability: + // + size_t free_mem_sp_bytes{0}; + size_t total_mem_sp_bytes{0}; + cudaMemGetInfo(&free_mem_sp_bytes, &total_mem_sp_bytes); + + // GPU memory requirements: + // + size_t coalesced_v_count = num_paths * max_depth; + auto coalesced_e_count = coalesced_v_count - num_paths; + size_t req_mem_common = sizeof(vertex_t) * coalesced_v_count + + sizeof(weight_t) * coalesced_e_count + // coalesced_v + coalesced_w + (sizeof(vertex_t) + sizeof(index_t)) * num_paths; // start_v + sizes + + size_t req_mem_horizontal = + req_mem_common + sizeof(rnd_engine_t) * coalesced_e_count; // + rnd_buff + size_t req_mem_vertical = req_mem_common + (sizeof(edge_t) + 2 * sizeof(vertex_t) + + sizeof(weight_t) + sizeof(rnd_engine_t)) * + num_paths; // + smaller_rnd_buff + tmp_buffs + + bool use_vertical_strategy{false}; + if (req_mem_horizontal > req_mem_vertical && req_mem_horizontal > free_mem_sp_bytes) { + use_vertical_strategy = true; + std::cerr + << "WARNING: Due to GPU memory availability, slower vertical traversal will be used.\n"; + } + + if (use_vertical_strategy) { + auto quad_tuple = detail::random_walks_impl( + handle, graph, d_v_start, max_depth, use_padding); + // ignore last element of the quad, seed, + // since it's meant for testing / debugging, only: + // + return std::make_tuple(std::move(std::get<0>(quad_tuple)), + std::move(std::get<1>(quad_tuple)), + std::move(std::get<2>(quad_tuple))); + } else { + auto quad_tuple = detail::random_walks_impl(handle, graph, d_v_start, max_depth, use_padding); + // ignore last element of the quad, seed, + // since it's meant for testing / debugging, only: + // + return std::make_tuple(std::move(std::get<0>(quad_tuple)), + std::move(std::get<1>(quad_tuple)), + std::move(std::get<2>(quad_tuple))); + } +}
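A minimal call-site sketch for this entry point (the handle, the graph construction, and the exact view template parameters are assumptions for illustration, not part of this diff):

  // assumed to exist: raft::handle_t handle; and a single-GPU
  // cugraph::experimental::graph_view_t<int32_t, int32_t, float, false, false> graph_view;
  int32_t num_paths = 4, max_depth = 5;
  rmm::device_uvector<int32_t> d_start(num_paths, handle.get_stream());
  // ... fill d_start with valid starting vertex ids ...
  auto [d_paths_v, d_paths_w, d_sizes] = cugraph::experimental::random_walks(
    handle, graph_view, d_start.data(), num_paths, max_depth, false /*use_padding*/);

+ +/** + * @brief returns the COO format (src_vector, dst_vector) from the random walks (RW) + * paths. + * + * @tparam vertex_t Type of vertex indices. + * @tparam index_t Type used to store indexing and sizes. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param coalesced_sz_v coalesced vertex vector size. + * @param num_paths number of paths. + * @param d_coalesced_v coalesced vertex buffer. + * @param d_sizes paths size buffer. + * @return tuple of (src_vertex_vector, dst_vertex_vector, path_offsets), where + * path_offsets are the offsets where the COO set of each path starts.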
+ */ +template +std:: + tuple, rmm::device_uvector, rmm::device_uvector> + convert_paths_to_coo(raft::handle_t const& handle, + index_t coalesced_sz_v, + index_t num_paths, + rmm::device_buffer&& d_coalesced_v, + rmm::device_buffer&& d_sizes) +{ + detail::coo_convertor_t to_coo(handle, num_paths); + + detail::device_const_vector_view d_v_view( + static_cast(d_coalesced_v.data()), coalesced_sz_v); + + detail::device_const_vector_view d_sz_view(static_cast(d_sizes.data()), + num_paths); + + return to_coo(d_v_view, d_sz_view); +} + +/** + * @brief returns additional RW information on vertex paths offsets and weight path sizes and + * offsets, for the coalesced case (the padded case does not need or provide this information) + * + * @tparam index_t Type used to store indexing and sizes. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param num_paths number of paths. + * @param ptr_d_sizes sizes of vertex paths. + * @return tuple of (vertex_path_offsets, weight_path_sizes, weight_path_offsets), where offsets are + * exclusive scan of corresponding sizes. + */ +template +std::tuple, rmm::device_uvector, rmm::device_uvector> +query_rw_sizes_offsets(raft::handle_t const& handle, index_t num_paths, index_t const* ptr_d_sizes) +{ + rmm::device_uvector d_vertex_offsets(num_paths, handle.get_stream()); + rmm::device_uvector d_weight_sizes(num_paths, handle.get_stream()); + rmm::device_uvector d_weight_offsets(num_paths, handle.get_stream()); + + thrust::exclusive_scan(rmm::exec_policy(handle.get_stream_view()), + ptr_d_sizes, + ptr_d_sizes + num_paths, + d_vertex_offsets.begin()); + + thrust::transform(rmm::exec_policy(handle.get_stream_view()), + ptr_d_sizes, + ptr_d_sizes + num_paths, + d_weight_sizes.begin(), + [] __device__(auto vertex_path_sz) { return vertex_path_sz - 1; }); + + handle.get_stream_view().synchronize(); + + thrust::exclusive_scan(rmm::exec_policy(handle.get_stream_view()), + d_weight_sizes.begin(), + d_weight_sizes.end(), + d_weight_offsets.begin()); + + return std::make_tuple( + std::move(d_vertex_offsets), std::move(d_weight_sizes), std::move(d_weight_offsets)); +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp new file mode 100644 index 00000000000..b2ba74e97a2 --- /dev/null +++ b/cpp/src/sampling/rw_traversals.hpp @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Andrei Schaffer, aschaffer@nvidia.com +// +#pragma once + +#include + +#include + +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +template +using device_vec_t = rmm::device_uvector; + +template +using device_v_it = typename device_vec_t::iterator; + +template +value_t* raw_ptr(device_vec_t& dv) +{ + return dv.data(); +} + +template +value_t const* raw_const_ptr(device_vec_t const& dv) +{ + return dv.data(); +} + +template +struct device_const_vector_view { + device_const_vector_view(value_t const* d_buffer, index_t size) : d_buffer_(d_buffer), size_(size) + { + } + + device_const_vector_view(device_const_vector_view const& other) = delete; + device_const_vector_view& operator=(device_const_vector_view const& other) = delete; + + device_const_vector_view(device_const_vector_view&& other) + { + d_buffer_ = other.d_buffer_; + size_ = other.size_; + } + device_const_vector_view& operator=(device_const_vector_view&& other) + { + d_buffer_ = other.d_buffer_; + size_ = other.size_; + + return *this; + } + + value_t const* begin(void) const { return d_buffer_; } + + value_t const* end() const { return d_buffer_ + size_; } + + index_t size(void) const { return size_; } + + private: + value_t const* d_buffer_{nullptr}; + index_t size_; +}; + +template +value_t const* raw_const_ptr(device_const_vector_view& dv) +{ + return dv.begin(); +} + +// classes abstracting the way the random walks paths are generated: +// + +// vertical traversal proxy: +// a device vector of next vertices is generated for each path; +// when a vertex is a sink the corresponding path doesn't advance anymore; +// +// smaller memory footprint; +// +struct vertical_traversal_t { + vertical_traversal_t(size_t num_paths, size_t max_depth) + : num_paths_(num_paths), max_depth_(max_depth) + { + } + + template + void operator()( + graph_t const& graph, // graph being traversed + random_walker_t const& rand_walker, // random walker object for which traversal is driven + seed_t seed0, // initial seed value + device_vec_t& d_coalesced_v, // crt coalesced vertex set + device_vec_t& d_coalesced_w, // crt coalesced weight set + device_vec_t& d_paths_sz, // crt paths sizes + device_vec_t& + d_crt_out_degs, // crt out-degs for current set of vertices + device_vec_t& d_random, // crt set of random real values + device_vec_t& + d_col_indx, // crt col indices to be used for retrieving next step + device_vec_t& + d_next_v, // crt set of destination vertices, for next step + device_vec_t& + d_next_w) // set of weights between src and destination vertices, for next step + const + { + // start from 1, as 0-th was initialized above: + // + for (decltype(max_depth_) step_indx = 1; step_indx < max_depth_; ++step_indx) { + // take one-step in-sync for each path in parallel: + // + rand_walker.step(graph, + seed0 + static_cast(step_indx), + d_coalesced_v, + d_coalesced_w, + d_paths_sz, + d_crt_out_degs, + d_random, + d_col_indx, + d_next_v, + d_next_w); + + // early exit: all paths have reached sinks: + // + if (rand_walker.all_paths_stopped(d_crt_out_degs)) break; + } + } + + size_t get_random_buff_sz(void) const { return num_paths_; } + size_t get_tmp_buff_sz(void) const { return num_paths_; } + + private: + size_t num_paths_; + size_t max_depth_; +};
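The trade-off between the two traversal policies can be made concrete with hypothetical sizes (not from this patch): num_paths = 1,000,000 and max_depth = 100 give, per the policies' buffer-size accessors (element counts),

  // vertical:   get_random_buff_sz() == num_paths                 == 1e6  (one draw per path per step, buffer reused)
  //             get_tmp_buff_sz()    == num_paths                 == 1e6  (d_next_v/d_next_w scratch)
  // horizontal: get_random_buff_sz() == num_paths * (max_depth-1) == 99e6 (all steps generated up front)
  //             get_tmp_buff_sz()    == 0                                 (coalesced arrays updated directly)

+ +// horizontal traversal proxy: +// each path is generated independently from start to finish; +// when a vertex is a sink the corresponding path doesn't advance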
anymore; +// requires (num_paths x max_depth) precomputed real random values in [0,1]; +// +// larger memory footprint, but potentially more efficient; +// +struct horizontal_traversal_t { + horizontal_traversal_t(size_t num_paths, size_t max_depth) + : num_paths_(num_paths), max_depth_(max_depth) + { + } + + template + void operator()( + graph_t const& graph, // graph being traversed + random_walker_t const& rand_walker, // random walker object for which traversal is driven + seed_t seed0, // initial seed value + device_vec_t& d_coalesced_v, // crt coalesced vertex set + device_vec_t& d_coalesced_w, // crt coalesced weight set + device_vec_t& d_paths_sz, // crt paths sizes + device_vec_t& + d_crt_out_degs, // ignored: out-degs for the current set of vertices + device_vec_t& d_random, // _entire_ set of random real values + device_vec_t& + d_col_indx, // ignored: crt col indices to be used for retrieving next step + device_vec_t& + d_next_v, // ignored: crt set of destination vertices, for next step (coalesced set + // updated directly, instead) + device_vec_t& + d_next_w) // ignored: set of weights between src and destination vertices, for next step + // (coalesced set updated directly, instead) + const + { + using vertex_t = typename graph_t::vertex_type; + using edge_t = typename graph_t::edge_type; + using weight_t = typename graph_t::weight_type; + using random_engine_t = typename random_walker_t::rnd_engine_t; + + auto const& handle = rand_walker.get_handle(); + auto* ptr_d_random = raw_ptr(d_random); + + random_engine_t::generate_random(handle, ptr_d_random, d_random.size(), seed0); + + auto const* col_indices = graph.get_matrix_partition_view().get_indices(); + auto const* row_offsets = graph.get_matrix_partition_view().get_offsets(); + auto const* values = graph.get_matrix_partition_view().get_weights() + ? *(graph.get_matrix_partition_view().get_weights()) + : static_cast(nullptr); + auto* ptr_d_sizes = raw_ptr(d_paths_sz); + auto const& d_cached_out_degs = rand_walker.get_out_degs(); + + auto rnd_to_indx_convertor = [] __device__(real_t rnd_vindx, edge_t crt_out_deg) { + real_t max_ub = static_cast(crt_out_deg - 1); + auto interp_vindx = rnd_vindx * max_ub + real_t{.5}; + vertex_t v_indx = static_cast(interp_vindx); + return (v_indx >= crt_out_deg ? crt_out_deg - 1 : v_indx); + }; + + auto next_vw = + [row_offsets, + col_indices, + values] __device__(auto v_indx, // src vertex to find dst from + auto col_indx) { // column index, in {0,...,out_deg(v_indx)-1}, + // extracted from random value in [0..1] + auto start_row = row_offsets[v_indx]; + + auto weight_value = + (values == nullptr ? 
weight_t{1}
+                 : values[start_row + col_indx]);  // account for un-weighted graphs
+      return thrust::make_tuple(col_indices[start_row + col_indx], weight_value);
+    };
+
+    // start from 1, as 0-th was initialized above:
+    //
+    thrust::for_each(rmm::exec_policy(handle.get_stream_view()),
+                     thrust::make_counting_iterator<index_t>(0),
+                     thrust::make_counting_iterator<index_t>(num_paths_),
+                     [max_depth = max_depth_,
+                      ptr_d_cache_out_degs = raw_const_ptr(d_cached_out_degs),
+                      ptr_coalesced_v = raw_ptr(d_coalesced_v),
+                      ptr_coalesced_w = raw_ptr(d_coalesced_w),
+                      ptr_d_random,
+                      ptr_d_sizes,
+                      rnd_to_indx_convertor,
+                      next_vw] __device__(auto path_index) {
+                       auto chunk_offset   = path_index * max_depth;
+                       vertex_t src_vertex = ptr_coalesced_v[chunk_offset];
+
+                       for (index_t step_indx = 1; step_indx < max_depth; ++step_indx) {
+                         auto crt_out_deg = ptr_d_cache_out_degs[src_vertex];
+                         if (crt_out_deg == 0) break;
+
+                         // indexing into coalesced arrays of size num_paths x (max_depth - 1):
+                         // (d_random, d_coalesced_w)
+                         // with chunk_offset == path_index * max_depth, this equals
+                         // path_index * (max_depth - 1) + (step_indx - 1)
+                         //
+                         auto stepping_index = chunk_offset - path_index + step_indx - 1;
+
+                         auto real_rnd_indx = ptr_d_random[stepping_index];
+
+                         auto col_indx = rnd_to_indx_convertor(real_rnd_indx, crt_out_deg);
+                         auto pair_vw  = next_vw(src_vertex, col_indx);
+
+                         src_vertex      = thrust::get<0>(pair_vw);
+                         auto crt_weight = thrust::get<1>(pair_vw);
+
+                         ptr_coalesced_v[chunk_offset + step_indx] = src_vertex;
+                         ptr_coalesced_w[stepping_index]           = crt_weight;
+                         ptr_d_sizes[path_index]++;
+                       }
+                     });
+  }
+
+  size_t get_random_buff_sz(void) const { return num_paths_ * (max_depth_ - 1); }
+  size_t get_tmp_buff_sz(void) const
+  {
+    return 0;  // no need for tmp buffers (see "ignored" above)
+  }
+
+ private:
+  size_t num_paths_;
+  size_t max_depth_;
+};  // horizontal_traversal_t
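+
+// Rough usage sketch of the two traversal proxies above, kept as a comment
+// because the real call sites live in the random walks implementation that
+// selects between them (the values 4 and 6 here are hypothetical):
+//
+//   size_t num_paths = 4, max_depth = 6;
+//   horizontal_traversal_t traverse(num_paths, max_depth);
+//   traverse.get_random_buff_sz();  // 4 * (6 - 1) = 20 pre-generated reals in [0, 1]
+//   traverse.get_tmp_buff_sz();     // 0: this strategy needs no per-step scratch
+//
+//   vertical_traversal_t stepwise(num_paths, max_depth);
+//   stepwise.get_random_buff_sz();  // 4: one random value per path, regenerated each step
+
+}  // namespace detail
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/src/serialization/serializer.cu b/cpp/src/serialization/serializer.cu
new file mode 100644
index 00000000000..28529c9f3ed
--- /dev/null
+++ b/cpp/src/serialization/serializer.cu
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.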
+ */ + +// Andrei Schaffer, aschaffer@nvidia.com +// + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +namespace cugraph { +namespace serializer { + +template +void serializer_t::serialize(value_t val) +{ + auto byte_buff_sz = sizeof(value_t); + auto it_end = begin_ + byte_buff_sz; + + raft::update_device( + begin_, reinterpret_cast(&val), byte_buff_sz, handle_.get_stream()); + + begin_ = it_end; +} + +template +value_t serializer_t::unserialize(void) +{ + value_t val{}; + auto byte_buff_sz = sizeof(value_t); + + raft::update_host(&val, reinterpret_cast(cbegin_), 1, handle_.get_stream()); + + cbegin_ += byte_buff_sz; + return val; +} + +template +void serializer_t::serialize(value_t const* p_d_src, size_t size) +{ + auto byte_buff_sz = size * sizeof(value_t); + auto it_end = begin_ + byte_buff_sz; + byte_t const* byte_buff = reinterpret_cast(p_d_src); + + thrust::copy_n(rmm::exec_policy(handle_.get_stream_view()), byte_buff, byte_buff_sz, begin_); + + begin_ = it_end; +} + +template +rmm::device_uvector serializer_t::unserialize(size_t size) +{ + auto byte_buff_sz = size * sizeof(value_t); + rmm::device_uvector d_dest(size, handle_.get_stream()); + byte_t* byte_buff = reinterpret_cast(d_dest.data()); + + thrust::copy_n(rmm::exec_policy(handle_.get_stream_view()), cbegin_, byte_buff_sz, byte_buff); + + cbegin_ += byte_buff_sz; + return d_dest; +} + +// serialization of graph metadata, via device orchestration: +// +template +void serializer_t::serialize(serializer_t::graph_meta_t const& gmeta) +{ + using vertex_t = typename graph_t::vertex_type; + using edge_t = typename graph_t::edge_type; + using weight_t = typename graph_t::weight_type; + + if constexpr (!graph_t::is_multi_gpu) { + using bool_t = typename graph_meta_t::bool_ser_t; + + serialize(gmeta.num_vertices_); + serialize(gmeta.num_edges_); + serialize(static_cast(gmeta.properties_.is_symmetric)); + serialize(static_cast(gmeta.properties_.is_multigraph)); + serialize(static_cast(gmeta.is_weighted_)); + + auto seg_off_sz_bytes = + (gmeta.segment_offsets_ ? 
(*(gmeta.segment_offsets_)).size() : size_t{0}) * sizeof(vertex_t); + if (seg_off_sz_bytes > 0) { + auto it_end = begin_ + seg_off_sz_bytes; + + raft::update_device(begin_, + reinterpret_cast((*(gmeta.segment_offsets_)).data()), + seg_off_sz_bytes, + handle_.get_stream()); + + begin_ = it_end; + } + + } else { + CUGRAPH_FAIL("Unsupported graph type for serialization."); + } +} + +// unserialization of graph metadata, via device orchestration: +// +template +serializer_t::graph_meta_t serializer_t::unserialize( + size_t graph_meta_sz_bytes, + serializer_t::graph_meta_t const& empty_meta) // tag dispatching parameter +{ + using vertex_t = typename graph_t::vertex_type; + using edge_t = typename graph_t::edge_type; + using weight_t = typename graph_t::weight_type; + + if constexpr (!graph_t::is_multi_gpu) { + using bool_t = typename graph_meta_t::bool_ser_t; + + CUGRAPH_EXPECTS(graph_meta_sz_bytes >= 2 * sizeof(size_t) + 3 * sizeof(bool_t), + "Un/serialization meta size mismatch."); + + size_t num_vertices = unserialize(); + size_t num_edges = unserialize(); + bool_t is_symmetric = unserialize(); + bool_t is_multigraph = unserialize(); + bool_t is_weighted = unserialize(); + + graph_properties_t properties{static_cast(is_symmetric), + static_cast(is_multigraph)}; + + std::optional> segment_offsets{std::nullopt}; + + size_t seg_off_sz_bytes = graph_meta_sz_bytes - 2 * sizeof(size_t) - 3 * sizeof(bool_t); + + if (seg_off_sz_bytes > 0) { + segment_offsets = std::vector(seg_off_sz_bytes / sizeof(vertex_t), vertex_t{0}); + raft::update_host((*segment_offsets).data(), + reinterpret_cast(cbegin_), + seg_off_sz_bytes, + handle_.get_stream()); + + cbegin_ += seg_off_sz_bytes; + } + + return graph_meta_t{ + num_vertices, num_edges, properties, static_cast(is_weighted), segment_offsets}; + + } else { + CUGRAPH_FAIL("Unsupported graph type for unserialization."); + return graph_meta_t{}; + } +} + +// graph serialization: +// metadata argument (gvmeta) can be used for checking / testing; +// +template +void serializer_t::serialize(graph_t const& graph, serializer_t::graph_meta_t& gvmeta) +{ + using vertex_t = typename graph_t::vertex_type; + using edge_t = typename graph_t::edge_type; + using weight_t = typename graph_t::weight_type; + + if constexpr (!graph_t::is_multi_gpu) { + size_t num_vertices = graph.get_number_of_vertices(); + size_t num_edges = graph.get_number_of_edges(); + auto&& gview = graph.view(); + + gvmeta = graph_meta_t{graph}; + + auto offsets = gview.get_matrix_partition_view().get_offsets(); + auto indices = gview.get_matrix_partition_view().get_indices(); + auto weights = gview.get_matrix_partition_view().get_weights(); + + // FIXME: remove when host_bcast() becomes available for vectors; + // + // for now, this must come first, because unserialize() + // needs it at the beginning to extract graph metadata + // to be able to finish the rest of the graph unserialization; + // + serialize(gvmeta); + + serialize(offsets, num_vertices + 1); + serialize(indices, num_edges); + + if (weights) serialize(*weights, num_edges); + + } else { + CUGRAPH_FAIL("Unsupported graph type for serialization."); + } +} + +// graph unserialization: +// +template +graph_t serializer_t::unserialize(size_t device_sz_bytes, size_t host_sz_bytes) +{ + using vertex_t = typename graph_t::vertex_type; + using edge_t = typename graph_t::edge_type; + using weight_t = typename graph_t::weight_type; + + if constexpr (!graph_t::is_multi_gpu) { + graph_meta_t empty_meta{}; // tag-dispatching only + + // FIXME: remove when 
host_bcast() becomes available for vectors; + // + // for now, this must come first, because unserialize() + // needs it at the beginning to extract graph metadata + // to be able to finish the rest of the graph unserialization; + // + auto gvmeta = unserialize(host_sz_bytes, empty_meta); + + auto pair_sz = get_device_graph_sz_bytes(gvmeta); + + CUGRAPH_EXPECTS((pair_sz.first == device_sz_bytes) && (pair_sz.second == host_sz_bytes), + "Un/serialization size mismatch."); + + vertex_t num_vertices = gvmeta.num_vertices_; + edge_t num_edges = gvmeta.num_edges_; + auto g_props = gvmeta.properties_; + auto is_weighted = gvmeta.is_weighted_; + auto seg_offsets = gvmeta.segment_offsets_; + + auto d_offsets = unserialize(num_vertices + 1); + auto d_indices = unserialize(num_edges); + + return graph_t( + handle_, + num_vertices, + num_edges, + g_props, + std::move(d_offsets), + std::move(d_indices), + is_weighted ? std::optional>{unserialize(num_edges)} + : std::nullopt, + std::move(seg_offsets)); // RVO-ed + } else { + CUGRAPH_FAIL("Unsupported graph type for unserialization."); + + return graph_t{handle_}; + } +} + +// Manual template instantiations (EIDir's): +// +template void serializer_t::serialize(int32_t const* p_d_src, size_t size); +template void serializer_t::serialize(int64_t const* p_d_src, size_t size); +template void serializer_t::serialize(float const* p_d_src, size_t size); +template void serializer_t::serialize(double const* p_d_src, size_t size); + +template rmm::device_uvector serializer_t::unserialize(size_t size); +template rmm::device_uvector serializer_t::unserialize(size_t size); +template rmm::device_uvector serializer_t::unserialize(size_t size); +template rmm::device_uvector serializer_t::unserialize(size_t size); + +// serialize graph: +// +template void serializer_t::serialize( + graph_t const& graph, + serializer_t::graph_meta_t>&); + +template void serializer_t::serialize( + graph_t const& graph, + serializer_t::graph_meta_t>&); + +template void serializer_t::serialize( + graph_t const& graph, + serializer_t::graph_meta_t>&); + +template void serializer_t::serialize( + graph_t const& graph, + serializer_t::graph_meta_t>&); + +template void serializer_t::serialize( + graph_t const& graph, + serializer_t::graph_meta_t>&); + +template void serializer_t::serialize( + graph_t const& graph, + serializer_t::graph_meta_t>&); + +// unserialize graph: +// +template graph_t serializer_t::unserialize(size_t, size_t); + +template graph_t serializer_t::unserialize(size_t, size_t); + +template graph_t serializer_t::unserialize(size_t, size_t); + +template graph_t serializer_t::unserialize(size_t, size_t); + +template graph_t serializer_t::unserialize(size_t, size_t); + +template graph_t serializer_t::unserialize(size_t, size_t); + +} // namespace serializer +} // namespace cugraph diff --git a/cpp/src/sort/bitonic.cuh b/cpp/src/sort/bitonic.cuh deleted file mode 100644 index e2922a58d39..00000000000 --- a/cpp/src/sort/bitonic.cuh +++ /dev/null @@ -1,546 +0,0 @@ -// -*-c++-*- - -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Bitonic sort implementation -// Author: Chuck Hastings charlesh@nvidia.com - -// TODO: Read a paper (Hagen Peters 2011) that suggests some -// ways to optimize this. Need to shift into a kernel -// and then organize to support multiple passes in -// a single kernel call. This should reduce kernel -// launch overhead and the number of memory references, -// which should drive down the overall time. -// - -#ifndef BITONIC_SORT_H -#define BITONIC_SORT_H - -#include -#include - -#include -#include - -namespace cugraph { -namespace sort { - -namespace bitonic { -/* - * This implementation is based upon the bitonic sort technique. - * This should be pretty efficient in a SIMT environment. - */ -namespace detail { -/** - * @brief Compare two items, if the compare functor returns true - * then swap them. - * - * @param a - reference to the first item - * @param b - reference to the second item - * @param compare - reference to a comparison functor - */ -template -inline void __device__ compareAndSwap(ValueT &a, ValueT &b, CompareT &compare) -{ - if (!compare(a, b)) { thrust::swap(a, b); } -} - -/* - * @brief perform repartitioning of two sorted partitions. This - * is analagous to the bitonic merge step. But it only - * performs the compare and swap portion of the bitonic - * merge. The subsequent sorts are handled externally. - * - * The repartition assumes that the data is segregated - * into partitions of binSize. So if there are 8 elements - * and a bin size of 2 then the array will be partitioned - * into 4 bins of size 2. Each bin is assumed to be - * sorted. The repartition takes consecutive bins and - * repartitions them so that the first bin contains the - * low elements and the second bin contains the high elements. - * - * @param array - the array containing the data we need to repartition - * @param count - the number of elements in the array - * @param binSize - the size of the bin - * @param compare - comparison functor - */ -template -void repartition(ValueT *array, int count, int binSize, CompareT &compare) -{ - thrust::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(count / 2), - - [array, count, binSize, compare] __device__(int idx) { - // - // Identify which elements in which partition - // we are responsible for comparing and swapping - // - // We're running count/2 iterations. Each iteration - // needs to operate on a pair of elements. Consider - // the pairs of partitions, this will let us determine - // which elements we compare. - // - int bi_partition = idx / binSize; - - // - // bi_partition identifies which pair of partitions - // we're operating on. Out of each bin we're only - // going to do binSize comparisons, so the first - // element in the comparison will be based on - // idx % binSize. - // - int offset = idx % binSize; - - // - // First element is easy. - // Second element is "easy" but we'll fix - // special cases below. - // - int i = bi_partition * (binSize * 2) + offset; - int j = (bi_partition + 1) * (binSize * 2) - 1 - offset; - - // - // The last partition pair is the problem. - // There are several cases: - // 1) Both partitions are full. This - // is the easy case, we can just - // compare and swap elements - // 2) First partition is full, the second - // partition is not full (possibly - // empty). In this case, we only - // compare some of the elements. 
- // 3) First partition is not full, there - // is no second partition. In this - // case we actually don't have any - // work to do. - // - // This should be a simple check. If the - // second element is beyond the end of - // the array then there is nothing to compare - // and swap. Note that if the first - // element is beyond the end of the array - // there is also nothing to compare and swap, - // but if the first element is beyond the - // end of the array then the second element - // will also be beyond the end of the array. - // - if (j < count) compareAndSwap(array[i], array[j], compare); - }); -} - -/* - * @brief perform shuffles. After the repartition we need - * to perform shuffles of the halves to get things in - * order. - * - * @param array - the array containing the data we need to repartition - * @param count - the number of elements in the array - * @param binSize - the size of the bin - * @param compare - comparison functor - */ -template -void shuffles(ValueT *array, int count, int binSize, CompareT &compare) -{ - thrust::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator((count + 1) / 2), - [array, count, binSize, compare] __device__(int idx) { - // - // Identify which elements in which partition - // we are responsible for comparing and swapping - // - // We're running count/2 iterations. Each iteration - // needs to operate on a pair of elements. Consider - // the pairs of partitions, this will let us determine - // which elements we compare. - // - int bi_partition = idx / binSize; - - // - // bi_partition identifies which pair of partitions - // we're operating on. Out of each bin we're only - // going to do binSize comparisons, so the first - // element in the comparison will be based on - // idx % binSize. - // - int offset = idx % binSize; - - // - // First element is easy. - // Second element is "easy" i + binSize. - // - int i = bi_partition * (binSize * 2) + offset; - int j = i + binSize; - - // - // If the second element is beyond the end of - // the array then there is nothing to compare - // and swap. - // - if (j < count) compareAndSwap(array[i], array[j], compare); - }); -} - -/* - * @brief perform repartitioning of two sorted partitions in the - * segmented sort case. - * - * The repartition assumes that the data is segregated - * into partitions of binSize. So if there are 8 elements - * and a bin size of 2 then the array will be partitioned - * into 4 bins of size 2. Each bin is assumed to be - * sorted. The repartition takes consecutive bins and - * repartitions them so that the first bin contains the - * low elements and the second bin contains the high elements. 
- * - * @param array - the array containing the data we need to repartition - * @param count - the number of elements in the array - * @param binSize - the size of the bin - * @param compare - comparison functor - */ -template -void repartition_segmented(const IndexT *d_begin_offsets, - const IndexT *d_end_offsets, - ValueT *d_items, - IndexT start, - IndexT stop, - IndexT *d_grouped_bins, - int binSize, - int max_count, - int bin_pairs, - CompareT &compare) -{ - thrust::for_each(thrust::device, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(max_count / 2), - [d_begin_offsets, - d_end_offsets, - d_items, - start, - stop, - d_grouped_bins, - bin_pairs, - binSize, - compare] __device__(int idx) { - // - // idx needs to be mapped into the correct place - // - int entry = idx / bin_pairs; - int entry_idx = idx % bin_pairs; - int base = d_begin_offsets[d_grouped_bins[start + entry]]; - int count = d_end_offsets[d_grouped_bins[start + entry]] - base; - - // - // Identify which elements in which partition - // we are responsible for comparing and swapping - // - // We're running count/2 iterations. Each iteration - // needs to operate on a pair of elements. Consider - // the pairs of partitions, this will let us determine - // which elements we compare. - // - int bi_partition = entry_idx / binSize; - - // - // bi_partition identifies which pair of partitions - // we're operating on. Out of each bin we're only - // going to do binSize comparisons, so the first - // element in the comparison will be based on - // idx % binSize. - // - int offset = entry_idx % binSize; - - // - // First element is easy. - // Second element is "easy" but we'll fix - // special cases below. - // - int i = bi_partition * (binSize * 2) + offset; - int j = (bi_partition + 1) * (binSize * 2) - 1 - offset; - - // - // The last partition pair is the problem. - // There are several cases: - // 1) Both partitions are full. This - // is the easy case, we can just - // compare and swap elements - // 2) First partition is full, the second - // partition is not full (possibly - // empty). In this case, we only - // compare some of the elements. - // 3) First partition is not full, there - // is no second partition. In this - // case we actually don't have any - // work to do. - // - // This should be a simple check. If the - // second element is beyond the end of - // the array then there is nothing to compare - // and swap. Note that if the first - // element is beyond the end of the array - // there is also nothing to compare and swap, - // but if the first element is beyond the - // end of the array then the second element - // will also be beyond the end of the array. - // - if (j < count) { - compareAndSwap(d_items[base + i], d_items[base + j], compare); - } - }); -} - -/* - * @brief perform shuffles. After the repartition we need - * to perform shuffles of the halves to get things in - * order. - * - * @param rowOffsets - the row offsets identifying the segments - * @param colIndices - the values to sort within the segments - * @param start - position within the grouped bins where we - * start this pass - * @param stop - position within the grouped bins where we stop - * this pass - * @param d_grouped_bins - lrb grouped bins. 
All bins between - * start and stop are in the same lrb bin - * @param binSize - the bitonic bin size for this pass of the shuffles - * @param max_count - maximum number of elements possible for - * this call - * @param bin_pairs - the number of bin pairs - * @param compare - the comparison functor - */ -template -void shuffles_segmented(const IndexT *d_begin_offsets, - const IndexT *d_end_offsets, - ValueT *d_items, - IndexT start, - IndexT stop, - IndexT *d_grouped_bins, - int binSize, - long max_count, - int bin_pairs, - CompareT &compare) -{ - thrust::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(max_count / 2), - [d_begin_offsets, - d_end_offsets, - d_items, - start, - stop, - d_grouped_bins, - compare, - max_count, - bin_pairs, - binSize] __device__(int idx) { - // - // idx needs to be mapped into the correct place - // - int entry = idx / bin_pairs; - int entry_idx = idx % bin_pairs; - int base = d_begin_offsets[d_grouped_bins[start + entry]]; - int count = d_end_offsets[d_grouped_bins[start + entry]] - base; - - // - // Identify which elements in which partition - // we are responsible for comparing and swapping - // - // We're running count/2 iterations. Each iteration - // needs to operate on a pair of elements. Consider - // the pairs of partitions, this will let us determine - // which elements we compare. - // - int bi_partition = entry_idx / binSize; - - // - // bi_partition identifies which pair of partitions - // we're operating on. Out of each bin we're only - // going to do binSize comparisons, so the first - // element in the comparison will be based on - // idx % binSize. - // - int offset = entry_idx % binSize; - - // - // First element is easy. - // Second element is "easy" i + binSize. - // - int i = bi_partition * (binSize * 2) + offset; - int j = i + binSize; - - // - // If the second element is beyond the end of - // the array then there is nothing to compare - // and swap. - // - if (j < count) compareAndSwap(d_items[base + i], d_items[base + j], compare); - }); -} -} // namespace detail - -template -void sort(ValueT *array, int count, CompareT &compare) -{ - for (int i = 1; i < count; i *= 2) { - detail::repartition(array, count, i, compare); - - for (int j = i / 2; j > 0; j /= 2) { detail::shuffles(array, count, j, compare); } - } -} - -/** - * @brief Perform a segmented sort. This function performs a sort - * on each segment of the specified input. This sort is done - * in place, so the d_items array is modified during this call. - * Sort is done according to the (optionally) specified - * comparison function. - * - * Note that this function uses O(num_segments) temporary - * memory during execution. - * - * @param [in] num_segments - the number of segments that the items array is divided into - * @param [in] num_items - the number of items in the array - * @param [in] d_begin_offsets - device array containing the offset denoting the start - * of each segment - * @param [in] d_end_offsets - device array containing the offset denoting the end - * of each segment. - * @param [in/out] d_items - device array containing the items to sort - * @param [in] compare - [optional] comparison function. Default is thrust::less. - * @param [in] stream - [optional] CUDA stream to launch kernels with. Default is stream 0. 
- * - * @return error code - */ -template -void segmented_sort(IndexT num_segments, - IndexT num_items, - const IndexT *d_begin_offsets, - const IndexT *d_end_offsets, - ValueT *d_items, - CompareT compare = thrust::less(), - cudaStream_t stream = nullptr) -{ - // - // NOTE: This should probably be computed somehow. At the moment - // we are limited to 32 bits because of memory sizes. - // - int lrb_size = 32; - IndexT lrb[lrb_size + 1]; - - rmm::device_vector lrb_v(lrb_size + 1); - rmm::device_vector grouped_bins_v(num_segments + 1); - - IndexT *d_lrb = lrb_v.data().get(); - IndexT *d_grouped_bins = grouped_bins_v.data().get(); - - CUDA_TRY(cudaMemset(d_lrb, 0, (lrb_size + 1) * sizeof(IndexT))); - - // - // First we'll count how many entries go in each bin - // - thrust::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_segments), - [d_begin_offsets, d_end_offsets, d_lrb] __device__(int idx) { - int size = d_end_offsets[idx] - d_begin_offsets[idx]; - // - // NOTE: If size is 0 or 1 then no - // sorting is required, so we'll - // eliminate those bins here - // - if (size > 1) atomicAdd(d_lrb + __clz(size), 1); - }); - - // - // Exclusive sum will identify where each bin begins - // - thrust::exclusive_scan( - rmm::exec_policy(stream)->on(stream), d_lrb, d_lrb + (lrb_size + 1), d_lrb); - - // - // Copy the start of each bin to local memory - // - CUDA_TRY(cudaMemcpy(lrb, d_lrb, (lrb_size + 1) * sizeof(IndexT), cudaMemcpyDeviceToHost)); - - // - // Now we'll populate grouped_bins. This will corrupt - // d_lrb, but we've already copied it locally. - // - thrust::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_segments), - [d_begin_offsets, d_end_offsets, d_lrb, d_grouped_bins] __device__(int idx) { - int size = d_end_offsets[idx] - d_begin_offsets[idx]; - if (size > 1) { - int pos = atomicAdd(d_lrb + __clz(size), 1); - d_grouped_bins[pos] = idx; - } - }); - - // - // At this point, d_grouped_bins contains the index of the - // different segments, ordered into log2 bins. - // - - // - // Now we're ready to go. - // - // For simplicity (at least for now), let's just - // iterate over each lrb bin. Note that the larger - // the index i, the smaller the size of each bin... but - // there will likely be many more inhabitants of that bin. - // - for (int i = 0; i < lrb_size; ++i) { - int size = lrb[i + 1] - lrb[i]; - if (size > 0) { - // - // There are inhabitants of this lrb range - // - // max_count will be used to drive the bitonic - // passes (1, 2, 4, 8, ... up to max_count) - // - int max_count = 1 << (lrb_size - i); - - for (int j = 1; j < max_count; j *= 2) { - detail::repartition_segmented(d_begin_offsets, - d_end_offsets, - d_items, - lrb[i], - lrb[i + 1], - d_grouped_bins, - j, - size * max_count, - max_count / 2, - compare); - - for (int k = j / 2; k > 0; k /= 2) { - detail::shuffles_segmented(d_begin_offsets, - d_end_offsets, - d_items, - lrb[i], - lrb[i + 1], - d_grouped_bins, - k, - size * max_count, - max_count / 2, - compare); - } - } - } - } -} - -} // namespace bitonic -} // namespace sort -} // namespace cugraph - -#endif diff --git a/cpp/src/structure/create_graph_from_edgelist.cpp b/cpp/src/structure/create_graph_from_edgelist.cpp new file mode 100644 index 00000000000..6ce10c7ccdf --- /dev/null +++ b/cpp/src/structure/create_graph_from_edgelist.cpp @@ -0,0 +1,485 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +namespace cugraph { +namespace experimental { + +namespace { + +template +std::enable_if_t< + multi_gpu, + std::tuple< + cugraph::experimental::graph_t, + std::optional>>> +create_graph_from_edgelist_impl( + raft::handle_t const& handle, + std::optional> optional_local_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber) +{ + CUGRAPH_EXPECTS(renumber, "renumber should be true if multi_gpu is true."); + + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + auto edge_counts = + cugraph::detail::groupby_and_count_by_edge(handle, + store_transposed ? edgelist_cols : edgelist_rows, + store_transposed ? edgelist_rows : edgelist_cols, + edgelist_weights, + col_comm_size); + + std::vector h_edge_counts(edge_counts.size()); + raft::update_host( + h_edge_counts.data(), edge_counts.data(), edge_counts.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + + std::vector h_displacements(h_edge_counts.size(), size_t{0}); + std::partial_sum(h_edge_counts.begin(), h_edge_counts.end() - 1, h_displacements.begin() + 1); + + // 3. renumber + + rmm::device_uvector renumber_map_labels(0, handle.get_stream()); + cugraph::experimental::partition_t partition{}; + vertex_t number_of_vertices{}; + edge_t number_of_edges{}; + auto segment_offsets = std::make_optional>(0); + { + std::vector major_ptrs(h_edge_counts.size()); + std::vector minor_ptrs(major_ptrs.size()); + std::vector counts(major_ptrs.size()); + for (size_t i = 0; i < h_edge_counts.size(); ++i) { + major_ptrs[i] = + (store_transposed ? edgelist_cols.begin() : edgelist_rows.begin()) + h_displacements[i]; + minor_ptrs[i] = + (store_transposed ? edgelist_rows.begin() : edgelist_cols.begin()) + h_displacements[i]; + counts[i] = static_cast(h_edge_counts[i]); + } + std::tie( + renumber_map_labels, partition, number_of_vertices, number_of_edges, *segment_offsets) = + cugraph::experimental::renumber_edgelist( + handle, optional_local_vertex_span, major_ptrs, minor_ptrs, counts); + } + + // 4. create a graph + + std::vector> edgelists( + h_edge_counts.size()); + for (size_t i = 0; i < h_edge_counts.size(); ++i) { + edgelists[i] = cugraph::experimental::edgelist_t{ + edgelist_rows.data() + h_displacements[i], + edgelist_cols.data() + h_displacements[i], + edgelist_weights + ? 
std::optional{(*edgelist_weights).data() + h_displacements[i]} + : std::nullopt, + static_cast(h_edge_counts[i])}; + } + + return std::make_tuple( + cugraph::experimental::graph_t( + handle, + edgelists, + partition, + number_of_vertices, + number_of_edges, + graph_properties, + std::optional>{segment_offsets}), + std::optional>{std::move(renumber_map_labels)}); +} + +template +std::enable_if_t< + !multi_gpu, + std::tuple< + cugraph::experimental::graph_t, + std::optional>>> +create_graph_from_edgelist_impl( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber) +{ + auto renumber_map_labels = + renumber ? std::make_optional>(0, handle.get_stream()) + : std::nullopt; + std::optional> segment_offsets{std::nullopt}; + if (renumber) { + segment_offsets = std::vector{}; + std::tie(*renumber_map_labels, *segment_offsets) = + cugraph::experimental::renumber_edgelist( + handle, + optional_vertex_span, + store_transposed ? edgelist_cols.data() : edgelist_rows.data(), + store_transposed ? edgelist_rows.data() : edgelist_cols.data(), + static_cast(edgelist_rows.size())); + } + + vertex_t num_vertices{}; + if (renumber) { + num_vertices = static_cast((*renumber_map_labels).size()); + } else { + if (optional_vertex_span) { + num_vertices = std::get<1>(*optional_vertex_span); + } else { + num_vertices = 1 + cugraph::detail::compute_maximum_vertex_id( + handle.get_stream_view(), edgelist_rows, edgelist_cols); + } + } + + return std::make_tuple( + cugraph::experimental::graph_t( + handle, + cugraph::experimental::edgelist_t{ + edgelist_rows.data(), + edgelist_cols.data(), + edgelist_weights ? 
std::optional{(*edgelist_weights).data()} + : std::nullopt, + static_cast(edgelist_rows.size())}, + num_vertices, + graph_properties, + std::optional>{segment_offsets}), + std::move(renumber_map_labels)); +} + +} // namespace + +template +std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber) +{ + return create_graph_from_edgelist_impl( + handle, + optional_vertex_span, + std::move(edgelist_rows), + std::move(edgelist_cols), + std::move(edgelist_weights), + graph_properties, + renumber); +} + +// explicit instantiations + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + 
graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> 
+create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +template std::tuple, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional> optional_vertex_span, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + std::optional>&& edgelist_weights, + graph_properties_t graph_properties, + bool renumber); + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu index 63ef725c3b7..e3bdd1d5c67 100644 --- a/cpp/src/structure/graph.cu +++ b/cpp/src/structure/graph.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,58 +14,60 @@ * limitations under the License. */ -#include -#include "utilities/error.hpp" -#include "utilities/graph_utils.cuh" +#include +#include +#include #include +#include namespace { template void degree_from_offsets(vertex_t number_of_vertices, - edge_t const *offsets, - edge_t *degree, - cudaStream_t stream) + edge_t const* offsets, + edge_t* degree, + rmm::cuda_stream_view stream_view) { // Computes out-degree for x = 0 and x = 2 thrust::for_each( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream_view), thrust::make_counting_iterator(0), thrust::make_counting_iterator(number_of_vertices), [offsets, degree] __device__(vertex_t v) { degree[v] = offsets[v + 1] - offsets[v]; }); } template -void degree_from_vertex_ids(const raft::handle_t *handle, +void degree_from_vertex_ids(const raft::handle_t* handle, vertex_t number_of_vertices, edge_t number_of_edges, - vertex_t const *indices, - edge_t *degree, - cudaStream_t stream) + vertex_t const* indices, + edge_t* degree, + rmm::cuda_stream_view stream_view) { - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(stream_view), thrust::make_counting_iterator(0), thrust::make_counting_iterator(number_of_edges), [indices, degree] __device__(edge_t e) { atomicAdd(degree + indices[e], 1); }); if ((handle != nullptr) && (handle->comms_initialized())) { - auto &comm = handle->get_comms(); - comm.allreduce(degree, degree, number_of_vertices, raft::comms::op_t::SUM, stream); + auto& comm = handle->get_comms(); + comm.allreduce(degree, degree, number_of_vertices, raft::comms::op_t::SUM, stream_view.value()); } } } // namespace namespace cugraph { +namespace legacy { template -void GraphViewBase::get_vertex_identifiers(VT *identifiers) const +void GraphViewBase::get_vertex_identifiers(VT* identifiers) const { 
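+  // fills the caller-provided buffer with the consecutive vertex ids 0 ... number_of_vertices - 1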
cugraph::detail::sequence(number_of_vertices, identifiers); } template -void GraphCompressedSparseBaseView::get_source_indices(VT *src_indices) const +void GraphCompressedSparseBaseView::get_source_indices(VT* src_indices) const { CUGRAPH_EXPECTS(offsets != nullptr, "No graph specified"); cugraph::detail::offsets_to_indices( @@ -73,7 +75,7 @@ void GraphCompressedSparseBaseView::get_source_indices(VT *src_indic } template -void GraphCOOView::degree(ET *degree, DegreeDirection direction) const +void GraphCOOView::degree(ET* degree, DegreeDirection direction) const { // // NOTE: We assume offsets/indices are a CSR. If a CSC is passed @@ -110,7 +112,7 @@ void GraphCOOView::degree(ET *degree, DegreeDirection direction) con } template -void GraphCompressedSparseBaseView::degree(ET *degree, DegreeDirection direction) const +void GraphCompressedSparseBaseView::degree(ET* degree, DegreeDirection direction) const { // // NOTE: We assume offsets/indices are a CSR. If a CSC is passed @@ -118,7 +120,7 @@ void GraphCompressedSparseBaseView::degree(ET *degree, DegreeDirecti // (e.g. if you have a CSC and you want in-degree (x=1) then pass // the offsets/indices and request an out-degree (x=2)) // - cudaStream_t stream{nullptr}; + rmm::cuda_stream_view stream_view; if (direction != DegreeDirection::IN) { if ((GraphViewBase::handle != nullptr) && @@ -127,7 +129,8 @@ void GraphCompressedSparseBaseView::degree(ET *degree, DegreeDirecti // source indexing for // the allreduce to work } - degree_from_offsets(GraphViewBase::number_of_vertices, offsets, degree, stream); + degree_from_offsets( + GraphViewBase::number_of_vertices, offsets, degree, stream_view); } if (direction != DegreeDirection::OUT) { @@ -136,7 +139,7 @@ void GraphCompressedSparseBaseView::degree(ET *degree, DegreeDirecti GraphViewBase::number_of_edges, indices, degree, - stream); + stream_view); } } @@ -147,4 +150,7 @@ template class GraphCOOView; template class GraphCOOView; template class GraphCompressedSparseBaseView; template class GraphCompressedSparseBaseView; +} // namespace legacy } // namespace cugraph + +#include diff --git a/cpp/src/traversal/README.md b/cpp/src/traversal/README.md new file mode 100644 index 00000000000..429b58d441e --- /dev/null +++ b/cpp/src/traversal/README.md @@ -0,0 +1,56 @@ +# Traversal +cuGraph traversal algorithms are contained in this directory + +## SSSP + +The unit test code is the best place to search for examples on calling SSSP. + + * [SG Implementation](../../tests/experimental/sssp_test.cpp) + * MG Implementation - TBD + +## Simple SSSP + +The example assumes that you create an SG or MG graph somehow. The caller must create the distances and predecessors vectors in device memory and pass in the raw pointers to those vectors into the SSSP function. + +```cpp +#include +... 
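+// (assumption: the elided include above is <cugraph/algorithms.hpp>, which
+// declares cugraph::experimental::sssp and cugraph::experimental::bfs)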
+using vertex_t = int32_t;   // or int64_t, whichever is appropriate
+using weight_t = float;     // or double, whichever is appropriate
+using result_t = weight_t;  // could specify float or double also
+raft::handle_t handle;      // Must be configured if MG
+auto graph_view = graph.view();  // assumes you have created a graph somehow
+vertex_t source;                 // Initialized by user
+
+rmm::device_uvector<weight_t> distances_v(graph_view.get_number_of_vertices(), handle.get_stream());
+rmm::device_uvector<vertex_t> predecessors_v(graph_view.get_number_of_vertices(), handle.get_stream());
+
+cugraph::experimental::sssp(handle, graph_view, distances_v.begin(), predecessors_v.begin(), source, std::numeric_limits<weight_t>::max(), false);
+```
+
+## BFS
+
+The unit test code is the best place to search for examples on calling BFS.
+
+ * [SG Implementation](../../tests/experimental/bfs_test.cpp)
+ * MG Implementation - TBD
+
+## Simple BFS
+
+The example assumes that you create an SG or MG graph somehow. The caller must create the distances and predecessors vectors in device memory and pass the raw pointers to those vectors into the BFS function.
+
+```cpp
+#include <cugraph/algorithms.hpp>
+...
+using vertex_t = int32_t;   // or int64_t, whichever is appropriate
+using weight_t = float;     // or double, whichever is appropriate
+using result_t = weight_t;  // could specify float or double also
+raft::handle_t handle;      // Must be configured if MG
+auto graph_view = graph.view();  // assumes you have created a graph somehow
+vertex_t source;                 // Initialized by user
+
+rmm::device_uvector<vertex_t> distances_v(graph_view.get_number_of_vertices(), handle.get_stream());
+rmm::device_uvector<vertex_t> predecessors_v(graph_view.get_number_of_vertices(), handle.get_stream());
+
+cugraph::experimental::bfs(handle, graph_view, distances_v.begin(), predecessors_v.begin(), source, false, std::numeric_limits<vertex_t>::max(), false);
+```
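+
+## Retrieving the results
+
+Both examples leave their output in device memory. Below is a minimal sketch (not part of the traversal API) of copying the BFS output back to the host. It assumes the `distances_v` and `predecessors_v` buffers from the BFS example above; the `h_`-prefixed host vectors are hypothetical names introduced here:
+
+```cpp
+#include <raft/cudart_utils.h>  // raft::update_host
+#include <vector>
+
+std::vector<vertex_t> h_distances(distances_v.size());
+std::vector<vertex_t> h_predecessors(predecessors_v.size());
+
+// asynchronous device-to-host copies on the handle's stream
+raft::update_host(h_distances.data(), distances_v.data(), distances_v.size(), handle.get_stream());
+raft::update_host(h_predecessors.data(), predecessors_v.data(), predecessors_v.size(), handle.get_stream());
+
+// block until the copies have completed before reading the host vectors
+handle.get_stream_view().synchronize();
+```
diff --git a/cpp/src/traversal/bfs.cu b/cpp/src/traversal/bfs.cu
index 7c59010cab8..74a94ba0670 100644
--- a/cpp/src/traversal/bfs.cu
+++ b/cpp/src/traversal/bfs.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.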
* * NVIDIA CORPORATION and its licensors retain all intellectual property * and proprietary rights in and to this software, related documentation @@ -14,14 +14,14 @@ #include #include "bfs.cuh" -#include "graph.hpp" +#include -#include +#include +#include #include "bfs_kernels.cuh" #include "mg/bfs.cuh" #include "mg/common_utils.cuh" #include "traversal_common.cuh" -#include "utilities/graph_utils.cuh" namespace cugraph { namespace detail { @@ -96,7 +96,7 @@ void BFS::setup() // Lets use this int* for the next 3 lines // Its dereferenced value is not initialized - so we dont care about what we // put in it - IndexType *d_nisolated = d_new_frontier_cnt; + IndexType* d_nisolated = d_new_frontier_cnt; cudaMemsetAsync(d_nisolated, 0, sizeof(IndexType), stream); // Computing isolated_bmap @@ -114,10 +114,10 @@ void BFS::setup() } template -void BFS::configure(IndexType *_distances, - IndexType *_predecessors, - double *_sp_counters, - int *_edge_mask) +void BFS::configure(IndexType* _distances, + IndexType* _predecessors, + double* _sp_counters, + int* _edge_mask) { distances = _distances; predecessors = _predecessors; @@ -473,11 +473,11 @@ template class BFS; // NOTE: SP counter increase extremely fast on large graph // It can easily reach 1e40~1e70 on GAP-road.mtx template -void bfs(raft::handle_t const &handle, - GraphCSRView const &graph, - VT *distances, - VT *predecessors, - double *sp_counters, +void bfs(raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + VT* distances, + VT* predecessors, + double* sp_counters, const VT start_vertex, bool directed, bool mg_batch) @@ -497,8 +497,8 @@ void bfs(raft::handle_t const &handle, VT number_of_vertices = graph.number_of_vertices; ET number_of_edges = graph.number_of_edges; - const VT *indices_ptr = graph.indices; - const ET *offsets_ptr = graph.offsets; + const VT* indices_ptr = graph.indices; + const ET* offsets_ptr = graph.offsets; int alpha = 15; int beta = 18; @@ -511,63 +511,69 @@ void bfs(raft::handle_t const &handle, } // Explicit Instantiation -template void bfs(raft::handle_t const &handle, - GraphCSRView const &graph, - uint32_t *distances, - uint32_t *predecessors, - double *sp_counters, - const uint32_t source_vertex, - bool directed, - bool mg_batch); +template void bfs( + raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + uint32_t* distances, + uint32_t* predecessors, + double* sp_counters, + const uint32_t source_vertex, + bool directed, + bool mg_batch); // Explicit Instantiation -template void bfs(raft::handle_t const &handle, - GraphCSRView const &graph, - uint32_t *distances, - uint32_t *predecessors, - double *sp_counters, - const uint32_t source_vertex, - bool directed, - bool mg_batch); +template void bfs( + raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + uint32_t* distances, + uint32_t* predecessors, + double* sp_counters, + const uint32_t source_vertex, + bool directed, + bool mg_batch); // Explicit Instantiation -template void bfs(raft::handle_t const &handle, - GraphCSRView const &graph, - int32_t *distances, - int32_t *predecessors, - double *sp_counters, - const int32_t source_vertex, - bool directed, - bool mg_batch); +template void bfs( + raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + int32_t* distances, + int32_t* predecessors, + double* sp_counters, + const int32_t source_vertex, + bool directed, + bool mg_batch); // Explicit Instantiation -template void bfs(raft::handle_t const &handle, - GraphCSRView const &graph, - int32_t *distances, 
- int32_t *predecessors, - double *sp_counters, - const int32_t source_vertex, - bool directed, - bool mg_batch); +template void bfs( + raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + int32_t* distances, + int32_t* predecessors, + double* sp_counters, + const int32_t source_vertex, + bool directed, + bool mg_batch); // Explicit Instantiation -template void bfs(raft::handle_t const &handle, - GraphCSRView const &graph, - int64_t *distances, - int64_t *predecessors, - double *sp_counters, - const int64_t source_vertex, - bool directed, - bool mg_batch); +template void bfs( + raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + int64_t* distances, + int64_t* predecessors, + double* sp_counters, + const int64_t source_vertex, + bool directed, + bool mg_batch); // Explicit Instantiation -template void bfs(raft::handle_t const &handle, - GraphCSRView const &graph, - int64_t *distances, - int64_t *predecessors, - double *sp_counters, - const int64_t source_vertex, - bool directed, - bool mg_batch); +template void bfs( + raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + int64_t* distances, + int64_t* predecessors, + double* sp_counters, + const int64_t source_vertex, + bool directed, + bool mg_batch); } // namespace cugraph diff --git a/cpp/src/traversal/bfs.cuh b/cpp/src/traversal/bfs.cuh index 6457665ec09..6bf8e0d0197 100644 --- a/cpp/src/traversal/bfs.cuh +++ b/cpp/src/traversal/bfs.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * * NVIDIA CORPORATION and its licensors retain all intellectual property * and proprietary rights in and to this software, related documentation @@ -25,8 +25,8 @@ template class BFS { private: IndexType number_of_vertices, number_of_edges; - const IndexType *row_offsets = nullptr; - const IndexType *col_indices = nullptr; + const IndexType* row_offsets = nullptr; + const IndexType* col_indices = nullptr; bool directed; bool deterministic; @@ -36,10 +36,10 @@ class BFS { bool computeDistances; bool computePredecessors; rmm::device_vector distances_vals; - IndexType *distances = nullptr; - IndexType *predecessors = nullptr; - double *sp_counters = nullptr; - int *edge_mask = nullptr; + IndexType* distances = nullptr; + IndexType* predecessors = nullptr; + double* sp_counters = nullptr; + int* edge_mask = nullptr; rmm::device_vector original_frontier; rmm::device_vector visited_bmap; @@ -53,16 +53,16 @@ class BFS { // Working data // For complete description of each, go to bfs.cu IndexType nisolated; - IndexType *frontier = nullptr; - IndexType *new_frontier = nullptr; - IndexType *frontier_vertex_degree = nullptr; - IndexType *exclusive_sum_frontier_vertex_degree = nullptr; - IndexType *unvisited_queue = nullptr; - IndexType *left_unvisited_queue = nullptr; - IndexType *d_new_frontier_cnt = nullptr; - IndexType *d_mu = nullptr; - IndexType *d_unvisited_cnt = nullptr; - IndexType *d_left_unvisited_cnt = nullptr; + IndexType* frontier = nullptr; + IndexType* new_frontier = nullptr; + IndexType* frontier_vertex_degree = nullptr; + IndexType* exclusive_sum_frontier_vertex_degree = nullptr; + IndexType* unvisited_queue = nullptr; + IndexType* left_unvisited_queue = nullptr; + IndexType* d_new_frontier_cnt = nullptr; + IndexType* d_mu = nullptr; + IndexType* d_unvisited_cnt = nullptr; + IndexType* d_left_unvisited_cnt = nullptr; IndexType vertices_bmap_size; @@ -80,8 +80,8 @@ class BFS { BFS(IndexType 
_number_of_vertices, IndexType _number_of_edges, - const IndexType *_row_offsets, - const IndexType *_col_indices, + const IndexType* _row_offsets, + const IndexType* _col_indices, bool _directed, IndexType _alpha, IndexType _beta, @@ -98,10 +98,10 @@ class BFS { setup(); } - void configure(IndexType *distances, - IndexType *predecessors, - double *sp_counters, - int *edge_mask); + void configure(IndexType* distances, + IndexType* predecessors, + double* sp_counters, + int* edge_mask); void traverse(IndexType source_vertex); }; diff --git a/cpp/src/traversal/bfs_kernels.cuh b/cpp/src/traversal/bfs_kernels.cuh index bf2ec2fc6ee..4e482b446ba 100644 --- a/cpp/src/traversal/bfs_kernels.cuh +++ b/cpp/src/traversal/bfs_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ #include #include -#include "graph.hpp" +#include #include "traversal_common.cuh" namespace cugraph { @@ -40,11 +40,11 @@ namespace bfs_kernels { // visited_bmap_nints = the visited_bmap is made of that number of ints template -__global__ void fill_unvisited_queue_kernel(int *visited_bmap, +__global__ void fill_unvisited_queue_kernel(int* visited_bmap, IndexType visited_bmap_nints, IndexType n, - IndexType *unvisited, - IndexType *unvisited_cnt) + IndexType* unvisited, + IndexType* unvisited_cnt) { typedef cub::BlockScan BlockScan; __shared__ typename BlockScan::TempStorage scan_temp_storage; @@ -118,8 +118,8 @@ __global__ void fill_unvisited_queue_kernel(int *visited_bmap, vec_v.z = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); vec_v.w = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - typename traversal::vec_t::vec4 *unvisited_i4 = - reinterpret_cast::vec4 *>( + typename traversal::vec_t::vec4* unvisited_i4 = + reinterpret_cast::vec4*>( &unvisited[current_unvisited_index]); *unvisited_i4 = vec_v; @@ -131,8 +131,8 @@ __global__ void fill_unvisited_queue_kernel(int *visited_bmap, vec_v.x = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); vec_v.y = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - typename traversal::vec_t::vec2 *unvisited_i2 = - reinterpret_cast::vec2 *>( + typename traversal::vec_t::vec2* unvisited_i2 = + reinterpret_cast::vec2*>( &unvisited[current_unvisited_index]); *unvisited_i2 = vec_v; @@ -152,11 +152,11 @@ __global__ void fill_unvisited_queue_kernel(int *visited_bmap, // Wrapper template -void fill_unvisited_queue(int *visited_bmap, +void fill_unvisited_queue(int* visited_bmap, IndexType visited_bmap_nints, IndexType n, - IndexType *unvisited, - IndexType *unvisited_cnt, + IndexType* unvisited, + IndexType* unvisited_cnt, cudaStream_t m_stream, bool deterministic) { @@ -181,11 +181,11 @@ void fill_unvisited_queue(int *visited_bmap, // template -__global__ void count_unvisited_edges_kernel(const IndexType *potentially_unvisited, +__global__ void count_unvisited_edges_kernel(const IndexType* potentially_unvisited, const IndexType potentially_unvisited_size, - const int *visited_bmap, - IndexType *degree_vertices, - IndexType *mu) + const int* visited_bmap, + IndexType* degree_vertices, + IndexType* mu) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage reduce_temp_storage; @@ -214,11 +214,11 @@ __global__ void count_unvisited_edges_kernel(const IndexType *potentially_unvisi // 
Wrapper template -void count_unvisited_edges(const IndexType *potentially_unvisited, +void count_unvisited_edges(const IndexType* potentially_unvisited, const IndexType potentially_unvisited_size, - const int *visited_bmap, - IndexType *node_degree, - IndexType *mu, + const int* visited_bmap, + IndexType* node_degree, + IndexType* mu, cudaStream_t m_stream) { dim3 grid, block; @@ -246,19 +246,19 @@ void count_unvisited_edges(const IndexType *potentially_unvisited, // template -__global__ void main_bottomup_kernel(const IndexType *unvisited, +__global__ void main_bottomup_kernel(const IndexType* unvisited, const IndexType unvisited_size, - IndexType *left_unvisited, - IndexType *left_unvisited_cnt, - int *visited_bmap, - const IndexType *row_ptr, - const IndexType *col_ind, + IndexType* left_unvisited, + IndexType* left_unvisited_cnt, + int* visited_bmap, + const IndexType* row_ptr, + const IndexType* col_ind, IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - IndexType *distances, - IndexType *predecessors, - int *edge_mask) + IndexType* new_frontier, + IndexType* new_frontier_cnt, + IndexType* distances, + IndexType* predecessors, + int* edge_mask) { typedef cub::BlockDiscontinuity BlockDiscontinuity; typedef cub::WarpReduce WarpReduce; @@ -292,7 +292,7 @@ __global__ void main_bottomup_kernel(const IndexType *unvisited, // When this kernel is converted to support different VT and ET, this // will likely split into invalid_vid and invalid_eid // This is equivalent to ~IndexType(0) (i.e., all bits set to 1) - constexpr IndexType invalid_idx = cugraph::invalid_idx::value; + constexpr IndexType invalid_idx = cugraph::legacy::invalid_idx::value; // we will call __syncthreads inside the loop // we need to keep complete block active @@ -487,19 +487,19 @@ __global__ void main_bottomup_kernel(const IndexType *unvisited, } template -void bottom_up_main(IndexType *unvisited, +void bottom_up_main(IndexType* unvisited, IndexType unvisited_size, - IndexType *left_unvisited, - IndexType *d_left_unvisited_idx, - int *visited, - const IndexType *row_ptr, - const IndexType *col_ind, + IndexType* left_unvisited, + IndexType* d_left_unvisited_idx, + int* visited, + const IndexType* row_ptr, + const IndexType* col_ind, IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_idx, - IndexType *distances, - IndexType *predecessors, - int *edge_mask, + IndexType* new_frontier, + IndexType* new_frontier_idx, + IndexType* distances, + IndexType* predecessors, + int* edge_mask, cudaStream_t m_stream, bool deterministic) { @@ -531,17 +531,17 @@ void bottom_up_main(IndexType *unvisited, // MAIN_BOTTOMUP_MAX_EDGES && no parent found // template -__global__ void bottom_up_large_degree_kernel(IndexType *left_unvisited, +__global__ void bottom_up_large_degree_kernel(IndexType* left_unvisited, IndexType left_unvisited_size, - int *visited, - const IndexType *row_ptr, - const IndexType *col_ind, + int* visited, + const IndexType* row_ptr, + const IndexType* col_ind, IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - IndexType *distances, - IndexType *predecessors, - int *edge_mask) + IndexType* new_frontier, + IndexType* new_frontier_cnt, + IndexType* distances, + IndexType* predecessors, + int* edge_mask) { int logical_lane_id = threadIdx.x % BOTTOM_UP_LOGICAL_WARP_SIZE; int logical_warp_id = threadIdx.x / BOTTOM_UP_LOGICAL_WARP_SIZE; @@ -550,7 +550,7 @@ __global__ void bottom_up_large_degree_kernel(IndexType *left_unvisited, // When this kernel is converted to 
support different VT and ET, this // will likely split into invalid_vid and invalid_eid // This is equivalent to ~IndexType(0) (i.e., all bits set to 1) - constexpr IndexType invalid_idx = cugraph::invalid_idx::value; + constexpr IndexType invalid_idx = cugraph::legacy::invalid_idx::value; // Inactive threads are not a pb for __ballot (known behaviour) for (IndexType idx = logical_warps_per_block * blockIdx.x + logical_warp_id; @@ -610,17 +610,17 @@ __global__ void bottom_up_large_degree_kernel(IndexType *left_unvisited, } template -void bottom_up_large(IndexType *left_unvisited, +void bottom_up_large(IndexType* left_unvisited, IndexType left_unvisited_size, - int *visited, - const IndexType *row_ptr, - const IndexType *col_ind, + int* visited, + const IndexType* row_ptr, + const IndexType* col_ind, IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_idx, - IndexType *distances, - IndexType *predecessors, - int *edge_mask, + IndexType* new_frontier, + IndexType* new_frontier_idx, + IndexType* distances, + IndexType* predecessors, + int* edge_mask, cudaStream_t m_stream, bool deterministic) { @@ -680,24 +680,24 @@ void bottom_up_large(IndexType *left_unvisited, template __global__ void topdown_expand_kernel( - const IndexType *row_ptr, - const IndexType *col_ind, - const IndexType *frontier, + const IndexType* row_ptr, + const IndexType* col_ind, + const IndexType* frontier, const IndexType frontier_size, const IndexType totaldegree, const IndexType max_items_per_thread, const IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - const IndexType *frontier_degrees_exclusive_sum, - const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, - int *previous_bmap, - int *bmap, - IndexType *distances, - IndexType *predecessors, - double *sp_counters, - const int *edge_mask, - const int *isolated_bmap, + IndexType* new_frontier, + IndexType* new_frontier_cnt, + const IndexType* frontier_degrees_exclusive_sum, + const IndexType* frontier_degrees_exclusive_sum_buckets_offsets, + int* previous_bmap, + int* bmap, + IndexType* distances, + IndexType* predecessors, + double* sp_counters, + const int* edge_mask, + const int* isolated_bmap, bool directed) { // BlockScan @@ -728,7 +728,7 @@ __global__ void topdown_expand_kernel( // When this kernel is converted to support different VT and ET, this // will likely split into invalid_vid and invalid_eid // This is equivalent to ~IndexType(0) (i.e., all bits set to 1) - constexpr IndexType invalid_idx = cugraph::invalid_idx::value; + constexpr IndexType invalid_idx = cugraph::legacy::invalid_idx::value; IndexType n_items_per_thread_left = (totaldegree > block_offset) @@ -844,7 +844,7 @@ __global__ void topdown_expand_kernel( IndexType local_buf1[TOP_DOWN_BATCH_SIZE]; IndexType local_buf2[TOP_DOWN_BATCH_SIZE]; - IndexType *vec_frontier_degrees_exclusive_sum_index = &local_buf2[0]; + IndexType* vec_frontier_degrees_exclusive_sum_index = &local_buf2[0]; #pragma unroll for (IndexType iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { @@ -869,7 +869,7 @@ __global__ void topdown_expand_kernel( } } - IndexType *vec_row_ptr_u = &local_buf1[0]; + IndexType* vec_row_ptr_u = &local_buf1[0]; #pragma unroll for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { IndexType u = vec_u[iv]; @@ -878,7 +878,7 @@ __global__ void topdown_expand_kernel( } // We won't need row_ptr after that, reusing pointer - IndexType *vec_dest_v = vec_row_ptr_u; + IndexType* vec_dest_v = vec_row_ptr_u; #pragma unroll for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { 
@@ -901,7 +901,7 @@ __global__ void topdown_expand_kernel( } // We don't need vec_frontier_degrees_exclusive_sum_index anymore - IndexType *vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index; + IndexType* vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index; // Visited bmap need to contain information about the previous // frontier if we actually process every edge (shortest path counting) @@ -916,7 +916,7 @@ __global__ void topdown_expand_kernel( // From now on we will consider v as a frontier candidate // If for some reason vec_candidate[iv] should be put in the // new_frontier Then set vec_candidate[iv] = -1 - IndexType *vec_frontier_candidate = vec_dest_v; + IndexType* vec_frontier_candidate = vec_dest_v; #pragma unroll for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { @@ -943,7 +943,7 @@ __global__ void topdown_expand_kernel( if (directed) { // vec_v_visited_bmap is available - IndexType *vec_is_isolated_bmap = vec_v_visited_bmap; + IndexType* vec_is_isolated_bmap = vec_v_visited_bmap; #pragma unroll for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { @@ -1021,7 +1021,7 @@ __global__ void topdown_expand_kernel( IndexType naccepted_vertices = 0; // We won't need vec_frontier_candidate after that - IndexType *vec_frontier_accepted_vertex = vec_frontier_candidate; + IndexType* vec_frontier_accepted_vertex = vec_frontier_candidate; #pragma unroll for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { @@ -1094,23 +1094,23 @@ __global__ void topdown_expand_kernel( } template -void frontier_expand(const IndexType *row_ptr, - const IndexType *col_ind, - const IndexType *frontier, +void frontier_expand(const IndexType* row_ptr, + const IndexType* col_ind, + const IndexType* frontier, const IndexType frontier_size, const IndexType totaldegree, const IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - const IndexType *frontier_degrees_exclusive_sum, - const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, - int *previous_visited_bmap, - int *visited_bmap, - IndexType *distances, - IndexType *predecessors, - double *sp_counters, - const int *edge_mask, - const int *isolated_bmap, + IndexType* new_frontier, + IndexType* new_frontier_cnt, + const IndexType* frontier_degrees_exclusive_sum, + const IndexType* frontier_degrees_exclusive_sum_buckets_offsets, + int* previous_visited_bmap, + int* visited_bmap, + IndexType* distances, + IndexType* predecessors, + double* sp_counters, + const int* edge_mask, + const int* isolated_bmap, bool directed, cudaStream_t m_stream, bool deterministic) diff --git a/cpp/src/traversal/mg/bfs.cuh b/cpp/src/traversal/mg/bfs.cuh index b053a6ff75a..e6c8c3bf700 100644 --- a/cpp/src/traversal/mg/bfs.cuh +++ b/cpp/src/traversal/mg/bfs.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
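Aside on the bfs_kernels.cuh hunks above: frontier_expand drives topdown_expand_kernel with an exclusive prefix sum over the frontier vertices' degrees (frontier_degrees_exclusive_sum), so any worker can map a flat edge index back to its source frontier vertex; the buckets_offsets array presumably narrows that lookup to a small window. A minimal host-side sketch of the mapping, with illustrative names (frontier, degree, exclusive_sum) rather than the kernel's actual buffers:

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> frontier{4, 7, 9};  // vertex ids currently in the frontier
  std::vector<int> degree{3, 1, 2};    // out-degree of each frontier vertex
  std::vector<int> exclusive_sum(degree.size() + 1, 0);
  for (int i = 0; i < (int)degree.size(); ++i)
    exclusive_sum[i + 1] = exclusive_sum[i] + degree[i];
  int total_degree = exclusive_sum.back();  // 6 edge items to process this level

  for (int item = 0; item < total_degree; ++item) {
    // upper_bound - 1 locates the frontier entry that owns edge slot `item`
    auto it = std::upper_bound(exclusive_sum.begin(), exclusive_sum.end(), item);
    int idx = static_cast<int>(it - exclusive_sum.begin()) - 1;
    int u   = frontier[idx];              // source vertex of this edge item
    int off = item - exclusive_sum[idx];  // which of u's edges it is
    std::printf("item %d -> vertex %d, local edge %d\n", item, u, off);
  }
  return 0;
}

With exclusive_sum = [0, 3, 4, 6], edge items 0..2 resolve to vertex 4, item 3 to vertex 7, and items 4..5 to vertex 9, which is the same owner lookup the kernel performs per thread.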
@@ -28,12 +28,12 @@ namespace mg { namespace detail { template -void bfs_traverse(raft::handle_t const &handle, - cugraph::GraphCSRView const &graph, +void bfs_traverse(raft::handle_t const& handle, + cugraph::legacy::GraphCSRView const& graph, const vertex_t start_vertex, - rmm::device_vector &visited_bmap, - rmm::device_vector &output_frontier_bmap, - operator_t &bfs_op) + rmm::device_vector& visited_bmap, + rmm::device_vector& output_frontier_bmap, + operator_t& bfs_op) { // Frontiers required for BFS rmm::device_vector input_frontier(graph.number_of_vertices); @@ -47,8 +47,8 @@ void bfs_traverse(raft::handle_t const &handle, // Reusing buffers to create isolated bitmap { - rmm::device_vector &local_isolated_ids = input_frontier; - rmm::device_vector &global_isolated_ids = output_frontier; + rmm::device_vector& local_isolated_ids = input_frontier; + rmm::device_vector& global_isolated_ids = output_frontier; detail::create_isolated_bitmap( handle, graph, local_isolated_ids, global_isolated_ids, temp_buffer_len, isolated_bmap); } @@ -110,10 +110,10 @@ void bfs_traverse(raft::handle_t const &handle, } // namespace detail template -void bfs(raft::handle_t const &handle, - cugraph::GraphCSRView const &graph, - vertex_t *distances, - vertex_t *predecessors, +void bfs(raft::handle_t const& handle, + cugraph::legacy::GraphCSRView const& graph, + vertex_t* distances, + vertex_t* predecessors, const vertex_t start_vertex) { CUGRAPH_EXPECTS(handle.comms_initialized(), @@ -132,7 +132,7 @@ void bfs(raft::handle_t const &handle, thrust::fill(rmm::exec_policy(stream)->on(stream), predecessors, predecessors + global_number_of_vertices, - cugraph::invalid_idx::value); + cugraph::legacy::invalid_idx::value); if (distances == nullptr) { detail::BFSStepNoDist bfs_op( diff --git a/cpp/src/traversal/mg/common_utils.cuh b/cpp/src/traversal/mg/common_utils.cuh index 2cda827b471..9a95aba7901 100644 --- a/cpp/src/traversal/mg/common_utils.cuh +++ b/cpp/src/traversal/mg/common_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
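The MG utilities that follow lean heavily on packed bitmaps (one bit per vertex, 32 bits per word), e.g. visited_bmap, output_frontier_bmap and the isolated bitmap built above. A self-contained CUDA sketch of that convention, with assumed names; the real helpers are set_nth_bit and is_vertex_isolated in the next hunk:

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

constexpr int BitsPWrd = 32;  // bits per 32-bit word

__global__ void mark_vertices(uint32_t* bmap, int const* ids, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;
  int id = ids[i];
  // many threads may land in the same word, hence atomicOr (cf. set_nth_bit)
  atomicOr(bmap + (id / BitsPWrd), uint32_t{1} << (id % BitsPWrd));
}

int main()
{
  int const n_vertices = 100;
  int const n_words    = (n_vertices + BitsPWrd - 1) / BitsPWrd;  // cf. number_of_words
  int h_ids[]          = {3, 42, 99};

  uint32_t* d_bmap;
  int* d_ids;
  cudaMalloc(&d_bmap, n_words * sizeof(uint32_t));
  cudaMemset(d_bmap, 0, n_words * sizeof(uint32_t));
  cudaMalloc(&d_ids, sizeof(h_ids));
  cudaMemcpy(d_ids, h_ids, sizeof(h_ids), cudaMemcpyHostToDevice);

  mark_vertices<<<1, 32>>>(d_bmap, d_ids, 3);

  uint32_t h_bmap[n_words];
  cudaMemcpy(h_bmap, d_bmap, n_words * sizeof(uint32_t), cudaMemcpyDeviceToHost);
  // reading a bit back mirrors is_vertex_isolated
  bool marked = h_bmap[42 / BitsPWrd] & (uint32_t{1} << (42 % BitsPWrd));
  std::printf("vertex 42 marked: %d\n", marked);

  cudaFree(d_bmap);
  cudaFree(d_ids);
  return 0;
}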
@@ -16,10 +16,15 @@ #pragma once -#include +#include "../traversal_common.cuh" + #include +#include #include -#include "../traversal_common.cuh" + +#include +#include +#include namespace cugraph { @@ -41,25 +46,25 @@ constexpr inline return_t number_of_words(return_t number_of_bits) template struct isDegreeZero { - edge_t const *offset_; - isDegreeZero(edge_t const *offset) : offset_(offset) {} + edge_t const* offset_; + isDegreeZero(edge_t const* offset) : offset_(offset) {} - __device__ bool operator()(const edge_t &id) const { return (offset_[id + 1] == offset_[id]); } + __device__ bool operator()(const edge_t& id) const { return (offset_[id + 1] == offset_[id]); } }; struct set_nth_bit { - uint32_t *bmap_; - set_nth_bit(uint32_t *bmap) : bmap_(bmap) {} + uint32_t* bmap_; + set_nth_bit(uint32_t* bmap) : bmap_(bmap) {} template - __device__ void operator()(const return_t &id) + __device__ void operator()(const return_t& id) { atomicOr(bmap_ + (id / BitsPWrd), (uint32_t{1} << (id % BitsPWrd))); } }; template -bool is_vertex_isolated(rmm::device_vector &bmap, vertex_t id) +bool is_vertex_isolated(rmm::device_vector& bmap, vertex_t id) { uint32_t word = bmap[id / BitsPWrd]; uint32_t active_bit = static_cast(1) << (id % BitsPWrd); @@ -69,11 +74,11 @@ bool is_vertex_isolated(rmm::device_vector &bmap, vertex_t id) template struct BFSStepNoDist { - uint32_t *output_frontier_; - uint32_t *visited_; - vertex_t *predecessors_; + uint32_t* output_frontier_; + uint32_t* visited_; + vertex_t* predecessors_; - BFSStepNoDist(uint32_t *output_frontier, uint32_t *visited, vertex_t *predecessors) + BFSStepNoDist(uint32_t* output_frontier, uint32_t* visited, vertex_t* predecessors) : output_frontier_(output_frontier), visited_(visited), predecessors_(predecessors) { } @@ -100,13 +105,13 @@ struct BFSStepNoDist { template struct BFSStep { - uint32_t *output_frontier_; - uint32_t *visited_; - vertex_t *predecessors_; - vertex_t *distances_; + uint32_t* output_frontier_; + uint32_t* visited_; + vertex_t* predecessors_; + vertex_t* distances_; vertex_t level_; - BFSStep(uint32_t *output_frontier, uint32_t *visited, vertex_t *predecessors, vertex_t *distances) + BFSStep(uint32_t* output_frontier, uint32_t* visited, vertex_t* predecessors, vertex_t* distances) : output_frontier_(output_frontier), visited_(visited), predecessors_(predecessors), @@ -136,9 +141,10 @@ struct BFSStep { }; template -vertex_t populate_isolated_vertices(raft::handle_t const &handle, - cugraph::GraphCSRView const &graph, - rmm::device_vector &isolated_vertex_ids) +vertex_t populate_isolated_vertices( + raft::handle_t const& handle, + cugraph::legacy::GraphCSRView const& graph, + rmm::device_vector& isolated_vertex_ids) { bool is_mg = (handle.comms_initialized() && (graph.local_vertices != nullptr) && (graph.local_offsets != nullptr)); @@ -164,11 +170,11 @@ vertex_t populate_isolated_vertices(raft::handle_t const &handle, } template -return_t collect_vectors(raft::handle_t const &handle, - rmm::device_vector &buffer_len, - rmm::device_vector &local, +return_t collect_vectors(raft::handle_t const& handle, + rmm::device_vector& buffer_len, + rmm::device_vector& local, return_t local_count, - rmm::device_vector &global) + rmm::device_vector& global) { CHECK_CUDA(handle.get_stream()); buffer_len.resize(handle.get_comms().get_size()); @@ -200,9 +206,9 @@ return_t collect_vectors(raft::handle_t const &handle, } template -void add_to_bitmap(raft::handle_t const &handle, - rmm::device_vector &bmap, - rmm::device_vector &id, +void 
add_to_bitmap(raft::handle_t const& handle, + rmm::device_vector& bmap, + rmm::device_vector& id, return_t count) { cudaStream_t stream = handle.get_stream(); @@ -216,12 +222,12 @@ void add_to_bitmap(raft::handle_t const &handle, // For all vertex ids i which are isolated (out degree is 0), set // ith bit of isolated_bmap to 1 template -void create_isolated_bitmap(raft::handle_t const &handle, - cugraph::GraphCSRView const &graph, - rmm::device_vector &local_isolated_ids, - rmm::device_vector &global_isolated_ids, - rmm::device_vector &temp_buffer_len, - rmm::device_vector &isolated_bmap) +void create_isolated_bitmap(raft::handle_t const& handle, + cugraph::legacy::GraphCSRView const& graph, + rmm::device_vector& local_isolated_ids, + rmm::device_vector& global_isolated_ids, + rmm::device_vector& temp_buffer_len, + rmm::device_vector& isolated_bmap) { size_t word_count = detail::number_of_words(graph.number_of_vertices); local_isolated_ids.resize(graph.number_of_vertices); @@ -236,8 +242,8 @@ void create_isolated_bitmap(raft::handle_t const &handle, } template -return_t remove_duplicates(raft::handle_t const &handle, - rmm::device_vector &data, +return_t remove_duplicates(raft::handle_t const& handle, + rmm::device_vector& data, return_t data_len) { cudaStream_t stream = handle.get_stream(); @@ -253,13 +259,13 @@ return_t remove_duplicates(raft::handle_t const &handle, // ids. bmap is expected to be of the length // id_end/BitsPWrd and is set to 0 initially template -__global__ void remove_duplicates_kernel(uint32_t *bmap, - return_t *in_id, +__global__ void remove_duplicates_kernel(uint32_t* bmap, + return_t* in_id, return_t id_begin, return_t id_end, return_t count, - return_t *out_id, - return_t *out_count) + return_t* out_id, + return_t* out_count) { return_t tid = blockIdx.x * blockDim.x + threadIdx.x; return_t id; @@ -301,14 +307,14 @@ __global__ void remove_duplicates_kernel(uint32_t *bmap, } template -__global__ void remove_duplicates_kernel(uint32_t *bmap, - uint32_t *isolated_bmap, - return_t *in_id, +__global__ void remove_duplicates_kernel(uint32_t* bmap, + uint32_t* isolated_bmap, + return_t* in_id, return_t id_begin, return_t id_end, return_t count, - return_t *out_id, - return_t *out_count) + return_t* out_id, + return_t* out_count) { return_t tid = blockIdx.x * blockDim.x + threadIdx.x; return_t id; @@ -354,13 +360,13 @@ __global__ void remove_duplicates_kernel(uint32_t *bmap, } template -return_t remove_duplicates(raft::handle_t const &handle, - rmm::device_vector &bmap, - rmm::device_vector &data, +return_t remove_duplicates(raft::handle_t const& handle, + rmm::device_vector& bmap, + rmm::device_vector& data, return_t data_len, return_t data_begin, return_t data_end, - rmm::device_vector &out_data) + rmm::device_vector& out_data) { cudaStream_t stream = handle.get_stream(); @@ -382,13 +388,14 @@ return_t remove_duplicates(raft::handle_t const &handle, } template -vertex_t preprocess_input_frontier(raft::handle_t const &handle, - cugraph::GraphCSRView const &graph, - rmm::device_vector &bmap, - rmm::device_vector &isolated_bmap, - rmm::device_vector &input_frontier, - vertex_t input_frontier_len, - rmm::device_vector &output_frontier) +vertex_t preprocess_input_frontier( + raft::handle_t const& handle, + cugraph::legacy::GraphCSRView const& graph, + rmm::device_vector& bmap, + rmm::device_vector& isolated_bmap, + rmm::device_vector& input_frontier, + vertex_t input_frontier_len, + rmm::device_vector& output_frontier) { cudaStream_t stream = handle.get_stream(); @@ -414,12 
+421,13 @@ vertex_t preprocess_input_frontier(raft::handle_t const &handle, } template -vertex_t preprocess_input_frontier(raft::handle_t const &handle, - cugraph::GraphCSRView const &graph, - rmm::device_vector &bmap, - rmm::device_vector &input_frontier, - vertex_t input_frontier_len, - rmm::device_vector &output_frontier) +vertex_t preprocess_input_frontier( + raft::handle_t const& handle, + cugraph::legacy::GraphCSRView const& graph, + rmm::device_vector& bmap, + rmm::device_vector& input_frontier, + vertex_t input_frontier_len, + rmm::device_vector& output_frontier) { cudaStream_t stream = handle.get_stream(); @@ -444,7 +452,7 @@ vertex_t preprocess_input_frontier(raft::handle_t const &handle, } template -__global__ void fill_kernel(vertex_t *distances, vertex_t count, vertex_t start_vertex) +__global__ void fill_kernel(vertex_t* distances, vertex_t count, vertex_t start_vertex) { vertex_t tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid >= count) { return; } @@ -456,11 +464,11 @@ __global__ void fill_kernel(vertex_t *distances, vertex_t count, vertex_t start_ } template -void fill_max_dist(raft::handle_t const &handle, - cugraph::GraphCSRView const &graph, +void fill_max_dist(raft::handle_t const& handle, + cugraph::legacy::GraphCSRView const& graph, vertex_t start_vertex, vertex_t global_number_of_vertices, - vertex_t *distances) + vertex_t* distances) { if (distances == nullptr) { return; } vertex_t array_size = global_number_of_vertices; @@ -470,8 +478,9 @@ void fill_max_dist(raft::handle_t const &handle, } template -vertex_t get_global_vertex_count(raft::handle_t const &handle, - cugraph::GraphCSRView const &graph) +vertex_t get_global_vertex_count( + raft::handle_t const& handle, + cugraph::legacy::GraphCSRView const& graph) { rmm::device_vector id(1); id[0] = *thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), diff --git a/cpp/src/traversal/mg/frontier_expand.cuh b/cpp/src/traversal/mg/frontier_expand.cuh index 2733c319087..078ab085724 100644 --- a/cpp/src/traversal/mg/frontier_expand.cuh +++ b/cpp/src/traversal/mg/frontier_expand.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
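On the remove_duplicates overloads above: the first one's body falls outside these hunks, but its signature is the classic sort-then-unique reduction. A standalone thrust sketch of that shape, with assumed data:

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> h{5, 3, 5, 9, 3, 3};
  thrust::device_vector<int> data(h.begin(), h.end());

  thrust::sort(data.begin(), data.end());
  auto new_end     = thrust::unique(data.begin(), data.end());
  int unique_count = static_cast<int>(new_end - data.begin());

  std::printf("unique ids: %d\n", unique_count);  // 3 (namely 3, 5, 9)
  return 0;
}

The remove_duplicates_kernel variants instead mark ids in a zero-initialized bitmap and compact in a single pass, which avoids the sort when the ids are known to lie in [id_begin, id_end).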
@@ -16,7 +16,7 @@ #pragma once -#include +#include #include "frontier_expand_kernels.cuh" #include "vertex_binning.cuh" @@ -28,8 +28,8 @@ namespace detail { template class FrontierExpand { - raft::handle_t const &handle_; - cugraph::GraphCSRView const &graph_; + raft::handle_t const& handle_; + cugraph::legacy::GraphCSRView const& graph_; VertexBinner dist_; rmm::device_vector reorganized_vertices_; edge_t vertex_begin_; @@ -37,8 +37,8 @@ class FrontierExpand { rmm::device_vector output_vertex_count_; public: - FrontierExpand(raft::handle_t const &handle, - cugraph::GraphCSRView const &graph) + FrontierExpand(raft::handle_t const& handle, + cugraph::legacy::GraphCSRView const& graph) : handle_(handle), graph_(graph) { bool is_mg = (handle.comms_initialized() && (graph.local_vertices != nullptr) && @@ -59,9 +59,9 @@ class FrontierExpand { // Return the size of the output_frontier template vertex_t operator()(operator_t op, - rmm::device_vector &input_frontier, + rmm::device_vector& input_frontier, vertex_t input_frontier_len, - rmm::device_vector &output_frontier) + rmm::device_vector& output_frontier) { if (input_frontier_len == 0) { return static_cast(0); } cudaStream_t stream = handle_.get_stream(); diff --git a/cpp/src/traversal/mg/frontier_expand_kernels.cuh b/cpp/src/traversal/mg/frontier_expand_kernels.cuh index 625ec0d956f..32b9310f02f 100644 --- a/cpp/src/traversal/mg/frontier_expand_kernels.cuh +++ b/cpp/src/traversal/mg/frontier_expand_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include +#include #include "vertex_binning.cuh" namespace cugraph { @@ -26,13 +26,13 @@ namespace mg { namespace detail { template -__device__ void write_to_frontier(vertex_t const *thread_frontier, +__device__ void write_to_frontier(vertex_t const* thread_frontier, int thread_frontier_count, - vertex_t *block_frontier, - int *block_frontier_count, - vertex_t *output_frontier, - edge_t *block_write_offset, - edge_t *output_frontier_count) + vertex_t* block_frontier, + int* block_frontier_count, + vertex_t* output_frontier, + edge_t* block_write_offset, + edge_t* output_frontier_count) { // Set frontier count for block to 0 if (threadIdx.x == 0) { *block_frontier_count = 0; } @@ -66,13 +66,13 @@ template -__global__ void block_per_vertex(edge_t const *offsets, - vertex_t const *indices, - vertex_t const *input_frontier, +__global__ void block_per_vertex(edge_t const* offsets, + vertex_t const* indices, + vertex_t const* input_frontier, vertex_t input_frontier_count, vertex_t vertex_begin, - vertex_t *output_frontier, - edge_t *output_frontier_count, + vertex_t* output_frontier, + edge_t* output_frontier_count, operator_t op) { if (blockIdx.x >= input_frontier_count) { return; } @@ -121,13 +121,13 @@ template -__global__ void kernel_per_vertex(edge_t const *offsets, - vertex_t const *indices, - vertex_t const *input_frontier, +__global__ void kernel_per_vertex(edge_t const* offsets, + vertex_t const* indices, + vertex_t const* input_frontier, vertex_t input_frontier_count, vertex_t vertex_begin, - vertex_t *output_frontier, - edge_t *output_frontier_count, + vertex_t* output_frontier, + edge_t* output_frontier_count, operator_t op) { vertex_t current_vertex_index = 0; @@ -171,12 +171,12 @@ __global__ void kernel_per_vertex(edge_t const *offsets, } template -void 
large_vertex_lb(cugraph::GraphCSRView const &graph, - DegreeBucket &bucket, +void large_vertex_lb(cugraph::legacy::GraphCSRView const& graph, + DegreeBucket& bucket, operator_t op, vertex_t vertex_begin, - vertex_t *output_vertex_ids, - edge_t *output_vertex_ids_offset, + vertex_t* output_vertex_ids, + edge_t* output_vertex_ids_offset, cudaStream_t stream) { if (bucket.numberOfVertices != 0) { @@ -196,12 +196,12 @@ void large_vertex_lb(cugraph::GraphCSRView const &gr } template -void medium_vertex_lb(cugraph::GraphCSRView const &graph, - DegreeBucket &bucket, +void medium_vertex_lb(cugraph::legacy::GraphCSRView const& graph, + DegreeBucket& bucket, operator_t op, vertex_t vertex_begin, - vertex_t *output_vertex_ids, - edge_t *output_vertex_ids_offset, + vertex_t* output_vertex_ids, + edge_t* output_vertex_ids_offset, cudaStream_t stream) { // Vertices with degrees 2^12 <= d < 2^16 are handled by this kernel @@ -223,12 +223,12 @@ void medium_vertex_lb(cugraph::GraphCSRView const &g } template -void small_vertex_lb(cugraph::GraphCSRView const &graph, - DegreeBucket &bucket, +void small_vertex_lb(cugraph::legacy::GraphCSRView const& graph, + DegreeBucket& bucket, operator_t op, vertex_t vertex_begin, - vertex_t *output_vertex_ids, - edge_t *output_vertex_ids_offset, + vertex_t* output_vertex_ids, + edge_t* output_vertex_ids_offset, cudaStream_t stream) { int block_count = bucket.numberOfVertices; diff --git a/cpp/src/traversal/mg/vertex_binning.cuh b/cpp/src/traversal/mg/vertex_binning.cuh index 3d8c963c466..b4ed881a06e 100644 --- a/cpp/src/traversal/mg/vertex_binning.cuh +++ b/cpp/src/traversal/mg/vertex_binning.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include "common_utils.cuh" #include "vertex_binning_kernels.cuh" +#include + namespace cugraph { namespace mg { diff --git a/cpp/src/traversal/mg/vertex_binning_kernels.cuh b/cpp/src/traversal/mg/vertex_binning_kernels.cuh index dbb339fea05..57574965a3a 100644 --- a/cpp/src/traversal/mg/vertex_binning_kernels.cuh +++ b/cpp/src/traversal/mg/vertex_binning_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
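The *_vertex_lb helpers above dispatch frontier vertices to differently shaped kernels by degree; the 2^12 and 2^16 cutoffs come from the comment in medium_vertex_lb, with the small and large ranges as the implied complements. A tiny sketch of that classification, illustrative names only:

#include <cstdio>

// smallest b with 2^b >= d, i.e. the logarithmic bin a degree lands in
int ceil_log2(long long d)
{
  int b = 0;
  while ((1LL << b) < d) ++b;
  return b;
}

const char* bucket_for(long long degree)
{
  if (degree >= (1LL << 16)) return "large";
  if (degree >= (1LL << 12)) return "medium";
  return "small";
}

int main()
{
  long long degrees[] = {7, 5000, 1LL << 20};
  for (long long d : degrees)
    std::printf("degree %lld -> bin %d -> %s bucket\n", d, ceil_log2(d), bucket_for(d));
  return 0;
}

Each bucket then gets a matching launch shape, e.g. one block per vertex (block_per_vertex) for mid-sized degrees versus a whole grid per vertex (kernel_per_vertex) for the heaviest ones.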
@@ -40,7 +40,7 @@ __device__ inline typename std::enable_if<(sizeof(degree_t) == 8), int>::type ce } template -__global__ void simple_fill(return_t *bin0, return_t *bin1, return_t count) +__global__ void simple_fill(return_t* bin0, return_t* bin1, return_t count) { for (return_t i = 0; i < count; i++) { bin0[i] = 0; @@ -49,7 +49,7 @@ __global__ void simple_fill(return_t *bin0, return_t *bin1, return_t count) } template -__global__ void exclusive_scan(return_t *data, return_t *out) +__global__ void exclusive_scan(return_t* data, return_t* out) { constexpr int BinCount = NumberBins; return_t lData[BinCount]; @@ -71,9 +71,9 @@ __global__ void exclusive_scan(return_t *data, return_t *out) // In this function, any id in vertex_ids array is only acceptable as long // as its value is between vertex_begin and vertex_end template -__global__ void count_bin_sizes(edge_t *bins, - edge_t const *offsets, - vertex_t const *vertex_ids, +__global__ void count_bin_sizes(edge_t* bins, + edge_t const* offsets, + vertex_t const* vertex_ids, edge_t const vertex_id_count, vertex_t vertex_begin, vertex_t vertex_end) @@ -81,7 +81,9 @@ __global__ void count_bin_sizes(edge_t *bins, using cugraph::detail::traversal::atomicAdd; constexpr int BinCount = NumberBins; __shared__ edge_t lBin[BinCount]; - for (int i = threadIdx.x; i < BinCount; i += blockDim.x) { lBin[i] = 0; } + for (int i = threadIdx.x; i < BinCount; i += blockDim.x) { + lBin[i] = 0; + } __syncthreads(); for (vertex_t i = threadIdx.x + (blockIdx.x * blockDim.x); i < vertex_id_count; @@ -98,16 +100,18 @@ __global__ void count_bin_sizes(edge_t *bins, } __syncthreads(); - for (int i = threadIdx.x; i < BinCount; i += blockDim.x) { atomicAdd(bins + i, lBin[i]); } + for (int i = threadIdx.x; i < BinCount; i += blockDim.x) { + atomicAdd(bins + i, lBin[i]); + } } // Bin vertices to the appropriate bins by taking into account // the starting offsets calculated by count_bin_sizes template -__global__ void create_vertex_bins(vertex_t *out_vertex_ids, - edge_t *bin_offsets, - edge_t const *offsets, - vertex_t *in_vertex_ids, +__global__ void create_vertex_bins(vertex_t* out_vertex_ids, + edge_t* bin_offsets, + edge_t const* offsets, + vertex_t* in_vertex_ids, edge_t const vertex_id_count, vertex_t vertex_begin, vertex_t vertex_end) @@ -149,12 +153,12 @@ __global__ void create_vertex_bins(vertex_t *out_vertex_ids, } template -void bin_vertices(rmm::device_vector &input_vertex_ids, +void bin_vertices(rmm::device_vector& input_vertex_ids, vertex_t input_vertex_ids_len, - rmm::device_vector &reorganized_vertex_ids, - rmm::device_vector &bin_count_offsets, - rmm::device_vector &bin_count, - edge_t *offsets, + rmm::device_vector& reorganized_vertex_ids, + rmm::device_vector& bin_count_offsets, + rmm::device_vector& bin_count, + edge_t* offsets, vertex_t vertex_begin, vertex_t vertex_end, cudaStream_t stream) diff --git a/cpp/src/traversal/sssp.cu b/cpp/src/traversal/sssp.cu index 4018c9d9878..61225dd7fd6 100644 --- a/cpp/src/traversal/sssp.cu +++ b/cpp/src/traversal/sssp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
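The binning in vertex_binning_kernels.cuh is a standard two-pass pattern: pass 1 histograms bin sizes (count_bin_sizes), an exclusive scan turns counts into write offsets, and pass 2 scatters vertex ids to their bin's cursor (create_vertex_bins). A serial sketch of the same flow, assumed names only:

#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
  std::vector<int> bin_of = {2, 0, 1, 2, 0, 2};  // bin index per vertex id
  int num_bins = 3;

  std::vector<int> counts(num_bins, 0);
  for (int b : bin_of) counts[b]++;  // pass 1: histogram

  std::vector<int> offsets(num_bins + 1, 0);
  std::partial_sum(counts.begin(), counts.end(), offsets.begin() + 1);  // scan

  std::vector<int> reorganized(bin_of.size());
  std::vector<int> cursor(offsets.begin(), offsets.end() - 1);
  for (int v = 0; v < (int)bin_of.size(); ++v)  // pass 2: scatter
    reorganized[cursor[bin_of[v]]++] = v;

  for (int v : reorganized) std::printf("%d ", v);  // 1 4 2 0 3 5
  std::printf("\n");
  return 0;
}

On the GPU, pass 1 privatizes the histogram in shared memory (the lBin array) and merges it into the global bins with atomicAdd, exactly as count_bin_sizes shows.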
@@ -17,9 +17,9 @@ // Author: Prasun Gera pgera@nvidia.com #include -#include +#include -#include "graph.hpp" +#include #include "sssp.cuh" #include "sssp_kernels.cuh" @@ -47,11 +47,11 @@ void SSSP::setup() // Allocate buffer for data that need to be reset every iteration iter_buffer_size = sizeof(int) * (edges_bmap_size + vertices_bmap_size) + sizeof(IndexType); - iter_buffer.resize(iter_buffer_size); + iter_buffer.resize(iter_buffer_size, stream); // ith bit of relaxed_edges_bmap <=> ith edge was relaxed - relaxed_edges_bmap = static_cast(iter_buffer.data()); + relaxed_edges_bmap = static_cast(iter_buffer.data()); // ith bit of next_frontier_bmap <=> vertex is active in the next frontier - next_frontier_bmap = static_cast(iter_buffer.data()) + edges_bmap_size; + next_frontier_bmap = static_cast(iter_buffer.data()) + edges_bmap_size; // num vertices in the next frontier d_new_frontier_cnt = next_frontier_bmap + vertices_bmap_size; @@ -73,7 +73,7 @@ void SSSP::setup() exclusive_sum_frontier_vertex_buckets_offsets.resize(bucket_off_size); // Repurpose d_new_frontier_cnt temporarily - IndexType *d_nisolated = d_new_frontier_cnt; + IndexType* d_nisolated = d_new_frontier_cnt; cudaMemsetAsync(d_nisolated, 0, sizeof(IndexType), stream); // Computing isolated_bmap @@ -89,9 +89,9 @@ void SSSP::setup() } template -void SSSP::configure(DistType *_distances, - IndexType *_predecessors, - int *_edge_mask) +void SSSP::configure(DistType* _distances, + IndexType* _predecessors, + int* _edge_mask) { distances = _distances; predecessors = _predecessors; @@ -242,12 +242,12 @@ void SSSP::clean() * @file sssp.cu * --------------------------------------------------------------------------*/ template -void sssp(GraphCSRView const &graph, - WT *distances, - VT *predecessors, +void sssp(legacy::GraphCSRView const& graph, + WT* distances, + VT* predecessors, const VT source_vertex) { - CUGRAPH_EXPECTS(distances || predecessors, "Invalid API parameter, both outputs are nullptr"); + CUGRAPH_EXPECTS(distances || predecessors, "Invalid input argument, both outputs are nullptr"); if (typeid(VT) != typeid(int)) CUGRAPH_FAIL("Unsupported vertex id data type, please use int"); if (typeid(ET) != typeid(int)) CUGRAPH_FAIL("Unsupported edge id data type, please use int"); @@ -257,9 +257,9 @@ void sssp(GraphCSRView const &graph, int num_vertices = graph.number_of_vertices; int num_edges = graph.number_of_edges; - const ET *offsets_ptr = graph.offsets; - const VT *indices_ptr = graph.indices; - const WT *edge_weights_ptr = nullptr; + const ET* offsets_ptr = graph.offsets; + const VT* indices_ptr = graph.indices; + const WT* edge_weights_ptr = nullptr; // Both if / else branch operate own calls due to // thrust::device_vector lifetime @@ -281,7 +281,7 @@ void sssp(GraphCSRView const &graph, } else { // SSSP is not defined for graphs with negative weight cycles // Warn user about any negative edges - if (graph.prop.has_negative_edges == PropType::PROP_TRUE) + if (graph.prop.has_negative_edges == legacy::PropType::PROP_TRUE) std::cerr << "WARN: The graph has negative weight edges. 
SSSP will not " "converge if the graph has negative weight cycles\n"; edge_weights_ptr = graph.edge_data; @@ -293,13 +293,13 @@ void sssp(GraphCSRView const &graph, } // explicit instantiation -template void sssp(GraphCSRView const &graph, - float *distances, - int *predecessors, +template void sssp(legacy::GraphCSRView const& graph, + float* distances, + int* predecessors, const int source_vertex); -template void sssp(GraphCSRView const &graph, - double *distances, - int *predecessors, +template void sssp(legacy::GraphCSRView const& graph, + double* distances, + int* predecessors, const int source_vertex); } // namespace cugraph diff --git a/cpp/src/traversal/sssp_kernels.cuh b/cpp/src/traversal/sssp_kernels.cuh index d96540b22b9..d1cf9980773 100644 --- a/cpp/src/traversal/sssp_kernels.cuh +++ b/cpp/src/traversal/sssp_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,8 +19,8 @@ #include #include +#include #include "traversal_common.cuh" -#include "utilities/error.hpp" namespace cugraph { namespace detail { namespace sssp_kernels { diff --git a/cpp/src/traversal/traversal_common.cuh b/cpp/src/traversal/traversal_common.cuh index 2802fb94be8..ea77173870e 100644 --- a/cpp/src/traversal/traversal_common.cuh +++ b/cpp/src/traversal/traversal_common.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,11 +17,11 @@ #pragma once #include -#include "utilities/error.hpp" +#include #define MAXBLOCKS 65535 #define WARP_SIZE 32 -#define INT_SIZE 32 +#define INT_SIZE 32 // // Bottom up macros @@ -31,7 +31,7 @@ #define COUNT_UNVISITED_EDGES_DIMX 256 -#define MAIN_BOTTOMUP_DIMX 256 +#define MAIN_BOTTOMUP_DIMX 256 #define MAIN_BOTTOMUP_NWARPS (MAIN_BOTTOMUP_DIMX / WARP_SIZE) #define LARGE_BOTTOMUP_DIMX 256 diff --git a/cpp/src/traversal/tsp.cu b/cpp/src/traversal/tsp.cu new file mode 100644 index 00000000000..9be4f4f3767 --- /dev/null +++ b/cpp/src/traversal/tsp.cu @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2014-2020, Texas State University. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include + +#include + +#include "tsp.hpp" +#include "tsp_solver.hpp" + +namespace cugraph { +namespace detail { + +TSP::TSP(raft::handle_t const& handle, + int const* vtx_ptr, + float const* x_pos, + float const* y_pos, + int nodes, + int restarts, + bool beam_search, + int k, + int nstart, + bool verbose, + int* route) + : handle_(handle), + vtx_ptr_(vtx_ptr), + x_pos_(x_pos), + y_pos_(y_pos), + nodes_(nodes), + restarts_(restarts), + beam_search_(beam_search), + k_(k), + nstart_(nstart), + verbose_(verbose), + route_(route), + stream_(handle_.get_stream()), + max_blocks_(handle_.get_device_properties().maxGridSize[0]), + max_threads_(handle_.get_device_properties().maxThreadsPerBlock), + warp_size_(handle_.get_device_properties().warpSize), + sm_count_(handle_.get_device_properties().multiProcessorCount), + restart_batch_(8192), + mylock_scalar_(stream_), + best_cost_scalar_(stream_), + neighbors_vec_((k_ + 1) * nodes_, stream_), + work_vec_(restart_batch_ * ((4 * nodes_ + 3 + warp_size_ - 1) / warp_size_ * warp_size_), + stream_), + best_x_pos_vec_(1, stream_), + best_y_pos_vec_(1, stream_), + best_route_vec_(1, stream_) +{ + setup(); +} + +void TSP::setup() +{ + mylock_ = mylock_scalar_.data(); + + neighbors_ = neighbors_vec_.data(); + // pre-allocate workspace for climbs; each block needs a separate permutation space and search + // buffer. We allocate a work buffer that will store the computed distances, px, py and the route. + // We align it on the warp size. + work_ = work_vec_.data(); + + results_.best_x_pos = best_x_pos_vec_.data(); + results_.best_y_pos = best_y_pos_vec_.data(); + results_.best_route = best_route_vec_.data(); + results_.best_cost = best_cost_scalar_.data(); +} + +void TSP::reset_batch() +{ + mylock_scalar_.set_value_to_zero_async(stream_); + auto const max{std::numeric_limits::max()}; + best_cost_scalar_.set_value_async(max, stream_); +} + +void TSP::get_initial_solution(int const batch) +{ + if (!beam_search_) { + random_init<<>>( + work_, x_pos_, y_pos_, vtx_ptr_, nstart_, nodes_, batch, restart_batch_); + CHECK_CUDA(stream_); + } else { + knn_init<<>>( + work_, x_pos_, y_pos_, vtx_ptr_, neighbors_, nstart_, nodes_, k_, batch, restart_batch_); + CHECK_CUDA(stream_); + } +} + +float TSP::compute() +{ + float final_cost = 0.f; + int num_restart_batches = (restarts_ + restart_batch_ - 1) / restart_batch_; + int restart_resid = restarts_ - (num_restart_batches - 1) * restart_batch_; + int global_best = std::numeric_limits::max(); + int best = 0; + + std::vector h_x_pos; + std::vector h_y_pos; + std::vector h_route; + h_x_pos.reserve(nodes_ + 1); + h_y_pos.reserve(nodes_ + 1); + h_route.reserve(nodes_); + std::vector addr_best_x_pos(1); + std::vector addr_best_y_pos(1); + std::vector addr_best_route(1); + HighResTimer hr_timer; + auto create_timer = [&hr_timer, this](char const* name) { + return VerboseTimer(name, hr_timer, verbose_); + }; + + if (verbose_) { + std::cout << "Doing " << num_restart_batches << " batches of size " << restart_batch_ + << ", with " << restart_resid << " tail\n"; + std::cout << "configuration: " << nodes_ << " nodes, " << restarts_ << " restarts\n"; + std::cout << "optimizing graph with kswaps = " << kswaps << "\n"; + } + + // Tell the cache how we want it to behave + cudaFuncSetCacheConfig(search_solution, cudaFuncCachePreferEqual); + best_thread_num_ = best_thread_count(nodes_, max_threads_, sm_count_, warp_size_); + + if (verbose_) std::cout << "Calculated best thread number = " << best_thread_num_ << 
"\n"; + + if (beam_search_) { + auto timer = create_timer("knn"); + knn(); + } + + for (auto batch = 0; batch < num_restart_batches; ++batch) { + reset_batch(); + if (batch == num_restart_batches - 1) restart_batch_ = restart_resid; + + { + auto timer = create_timer("initial_sol"); + get_initial_solution(batch); + } + + { + auto timer = create_timer("search_sol"); + search_solution<<>>( + results_, mylock_, vtx_ptr_, beam_search_, k_, nodes_, x_pos_, y_pos_, work_, nstart_); + CHECK_CUDA(stream_); + } + + { + auto timer = create_timer("optimal_tour"); + get_optimal_tour<<>>(results_, mylock_, work_, nodes_); + CHECK_CUDA(stream_); + } + + cudaDeviceSynchronize(); + best = best_cost_scalar_.value(stream_); + + if (verbose_) std::cout << "Best reported by kernel = " << best << "\n"; + + if (best < global_best) { + global_best = best; + + raft::update_host(addr_best_x_pos.data(), results_.best_x_pos, 1, stream_); + raft::update_host(addr_best_y_pos.data(), results_.best_y_pos, 1, stream_); + raft::update_host(addr_best_route.data(), results_.best_route, 1, stream_); + CUDA_TRY(cudaStreamSynchronize(stream_)); + + raft::copy(h_x_pos.data(), addr_best_x_pos[0], nodes_ + 1, stream_); + raft::copy(h_y_pos.data(), addr_best_y_pos[0], nodes_ + 1, stream_); + raft::copy(h_route.data(), addr_best_route[0], nodes_, stream_); + raft::copy(route_, addr_best_route[0], nodes_, stream_); + CHECK_CUDA(stream_); + } + } + + for (auto i = 0; i < nodes_; ++i) { + if (verbose_) { std::cout << h_route[i] << ": " << h_x_pos[i] << " " << h_y_pos[i] << "\n"; } + final_cost += euclidean_dist(h_x_pos.data(), h_y_pos.data(), i, i + 1); + } + + if (verbose_) { + hr_timer.display(std::cout); + std::cout << "Optimized tour length = " << global_best << "\n"; + } + + return final_cost; +} + +void TSP::knn() +{ + if (verbose_) std::cout << "Looking at " << k_ << " nearest neighbors\n"; + + int dim = 2; + bool row_major_order = false; + + rmm::device_uvector input(nodes_ * dim, stream_); + float* input_ptr = input.data(); + raft::copy(input_ptr, x_pos_, nodes_, stream_); + raft::copy(input_ptr + nodes_, y_pos_, nodes_, stream_); + + rmm::device_uvector search_data(nodes_ * dim, stream_); + float* search_data_ptr = search_data.data(); + raft::copy(search_data_ptr, input_ptr, nodes_ * dim, stream_); + + rmm::device_uvector distances(nodes_ * (k_ + 1), stream_); + float* distances_ptr = distances.data(); + + std::vector input_vec; + std::vector sizes_vec; + input_vec.push_back(input_ptr); + sizes_vec.push_back(nodes_); + + // k neighbors + 1 is needed because the nearest neighbor of each point is + // the point itself that we don't want to take into account. 
+ + raft::spatial::knn::brute_force_knn(handle_, + input_vec, + sizes_vec, + dim, + search_data_ptr, + nodes_, + neighbors_, + distances_ptr, + k_ + 1, + row_major_order, + row_major_order); +} +} // namespace detail + +float traveling_salesperson(raft::handle_t const& handle, + int const* vtx_ptr, + float const* x_pos, + float const* y_pos, + int nodes, + int restarts, + bool beam_search, + int k, + int nstart, + bool verbose, + int* route) +{ + RAFT_EXPECTS(route != nullptr, "route should not be null"); + RAFT_EXPECTS(nodes > 0, "nodes should be strictly positive"); + RAFT_EXPECTS(restarts > 0, "restarts should be strictly positive"); + RAFT_EXPECTS(nstart >= 0 && nstart < nodes, "nstart should be between 0 and nodes - 1"); + RAFT_EXPECTS(k > 0, "k should be strictly positive"); + + cugraph::detail::TSP tsp( + handle, vtx_ptr, x_pos, y_pos, nodes, restarts, beam_search, k, nstart, verbose, route); + return tsp.compute(); +} + +} // namespace cugraph diff --git a/cpp/src/traversal/tsp.hpp b/cpp/src/traversal/tsp.hpp new file mode 100644 index 00000000000..f052462156f --- /dev/null +++ b/cpp/src/traversal/tsp.hpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2014-2020, Texas State University. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include +#include + +#include +#include +#include + +namespace cugraph { +namespace detail { + +struct TSPResults { + float** best_x_pos; + float** best_y_pos; + int** best_route; + int* best_cost; +}; + +class TSP { + public: + TSP(raft::handle_t const& handle, + int const* vtx_ptr, + float const* x_pos, + float const* y_pos, + int nodes, + int restarts, + bool beam_search, + int k, + int nstart, + bool verbose, + int* route); + + void setup(); + void reset_batch(); + void get_initial_solution(int const batch); + float compute(); + void knn(); + ~TSP(){}; + + private: + // Config + raft::handle_t const& handle_; + cudaStream_t stream_; + int max_blocks_; + int max_threads_; + int warp_size_; + int sm_count_; + // how large a grid we want to run, this is fixed + int restart_batch_; + int best_thread_num_; + + // TSP + int const* vtx_ptr_; + int* route_; + float const* x_pos_; + float const* y_pos_; + int nodes_; + int restarts_; + bool beam_search_; + int k_; + int nstart_; + bool verbose_; + + // Scalars + rmm::device_scalar mylock_scalar_; + rmm::device_scalar best_cost_scalar_; + + int* mylock_; + int* best_cost_; + + // Vectors + rmm::device_uvector neighbors_vec_; + rmm::device_uvector work_vec_; + rmm::device_uvector best_x_pos_vec_; + rmm::device_uvector best_y_pos_vec_; + rmm::device_uvector best_route_vec_; + + int64_t* neighbors_; + int* work_; + int* work_route_; + TSPResults results_; +}; + +class VerboseTimer { + public: + VerboseTimer(char const* name, HighResTimer& hr_timer, bool verbose) + : name_(name), hr_timer_(hr_timer), verbose_(verbose) + { + if (verbose_) hr_timer_.start(name_); + } + + ~VerboseTimer() + { + if (verbose_) hr_timer_.stop(); + } + + private: + const char* name_; + HighResTimer& hr_timer_; + bool verbose_; +}; + +} // namespace detail +} // namespace cugraph diff --git a/cpp/src/traversal/tsp_solver.hpp b/cpp/src/traversal/tsp_solver.hpp new file mode 100644 index 00000000000..9d36357046f --- /dev/null +++ b/cpp/src/traversal/tsp_solver.hpp @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2014-2020, Texas State University. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "tsp_utils.hpp" + +namespace cugraph { +namespace detail { + +__global__ void random_init(int* work, + float const* posx, + float const* posy, + int const* vtx_ptr, + int const nstart, + int const nodes, + int const batch, + int const restart_batch) +{ + int* buf = &work[blockIdx.x * ((4 * nodes + 3 + 31) / 32 * 32)]; + float* px = (float*)(&buf[nodes]); + float* py = &px[nodes + 1]; + int* path = (int*)(&py[nodes + 1]); + + // Fill values + for (int i = threadIdx.x; i <= nodes; i += blockDim.x) { + px[i] = posx[i]; + py[i] = posy[i]; + path[i] = vtx_ptr[i]; + } + + __syncthreads(); + // serial permutation as starting point + if (threadIdx.x == 0) { + // swap to start at nstart node + raft::swapVals(px[0], px[nstart]); + raft::swapVals(py[0], py[nstart]); + raft::swapVals(path[0], path[nstart]); + + curandState rndstate; + curand_init(blockIdx.x + (restart_batch * batch), 0, 0, &rndstate); + for (int i = 1; i < nodes; i++) { + int j = curand(&rndstate) % (nodes - 1 - i) + i; + if (i == j) continue; + raft::swapVals(px[i], px[j]); + raft::swapVals(py[i], py[j]); + raft::swapVals(path[i], path[j]); + } + // close the loop now, avoid special cases later + px[nodes] = px[0]; + py[nodes] = py[0]; + path[nodes] = path[0]; + } +} + +__global__ void knn_init(int* work, + float const* posx, + float const* posy, + int const* vtx_ptr, + int64_t const* neighbors, + int const nstart, + int const nodes, + int const K, + int const batch, + int const restart_batch) +{ + int* buf = &work[blockIdx.x * ((4 * nodes + 3 + 31) / 32 * 32)]; + float* px = (float*)(&buf[nodes]); + float* py = &px[nodes + 1]; + int* path = (int*)(&py[nodes + 1]); + + for (int i = threadIdx.x; i < nodes; i += blockDim.x) + buf[i] = 0; + + __syncthreads(); + + if (threadIdx.x == 0) { + curandState rndstate; + curand_init(blockIdx.x + (restart_batch * batch), 0, 0, &rndstate); + int progress = 0; + + px[0] = posx[nstart]; + py[0] = posy[nstart]; + path[0] = vtx_ptr[nstart]; + int head = nstart; + int v = 0; + buf[head] = 1; + while (progress < nodes - 1) { // beam search as starting point + for (int i = 1; i <= progress; i++) + buf[i] = 0; + progress = 0; // reset current location in path and visited array + int randjumps = 0; + while (progress < nodes - 1) { + int nj = curand(&rndstate) % K; + int linked = 0; + for (int nh = 0; nh < K; ++nh) { + // offset (idx / K) + 1 filters the points as their own nearest neighbors. 
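+ // Worked example of the indexing below (illustrative values): with K = 4
+ // and head = 2, rows of `neighbors` are K + 1 = 5 entries wide and column 0
+ // holds the point itself, so for any nj < K:
+ //   offset = (K * head + nj) / K + 1 = head + 1 = 3
+ //   K * head + nj + offset = 11 + nj = (K + 1) * head + (nj + 1)
+ // i.e. row `head` at stride K + 1, with the column shifted past the
+ // self-match in column 0.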
+ int offset = (K * head + nj) / K + 1; + v = neighbors[K * head + nj + offset]; + if (v < nodes && buf[v] == 0) { + head = v; + progress += 1; + buf[head] = 1; + linked = 1; + break; + } + nj = (nj + 1) % K; + } + if (linked == 0) { + if (randjumps > nodes - 1) + break; // give up on this traversal, we failed to find a next link + randjumps += 1; + int nr = (head + 1) % nodes; // jump to next node + while (buf[nr] == 1) { + nr = (nr + 1) % nodes; + } + head = nr; + progress += 1; + buf[head] = 1; + } + // copy from input into beam-search order, update len + px[progress] = posx[head]; + py[progress] = posy[head]; + path[progress] = vtx_ptr[head]; + } + } + px[nodes] = px[nstart]; + py[nodes] = py[nstart]; + path[nodes] = path[nstart]; + } +} + +__device__ void two_opt_search( + int* buf, float* px, float* py, int* shbuf, int* minchange, int* mini, int* minj, int const nodes) +{ + __shared__ float shmem_x[tilesize]; + __shared__ float shmem_y[tilesize]; + + for (int ii = 0; ii < nodes - 2; ii += blockDim.x) { + int i = ii + threadIdx.x; + float pxi0, pyi0, pxi1, pyi1, pxj1, pyj1; + if (i < nodes - 2) { + minchange[0] -= buf[i]; + pxi0 = px[i]; + pyi0 = py[i]; + pxi1 = px[i + 1]; + pyi1 = py[i + 1]; + pxj1 = px[nodes]; + pyj1 = py[nodes]; + } + for (int jj = nodes - 1; jj >= ii + 2; jj -= tilesize) { + int bound = jj - tilesize + 1; + for (int k = threadIdx.x; k < tilesize; k += blockDim.x) { + if (k + bound >= ii + 2) { + shmem_x[k] = px[k + bound]; + shmem_y[k] = py[k + bound]; + shbuf[k] = buf[k + bound]; + } + } + __syncthreads(); + + int lower = bound; + if (lower < (i + 2)) lower = i + 2; + for (int j = jj; j >= lower; j--) { + int jm = j - bound; + float pxj0 = shmem_x[jm]; + float pyj0 = shmem_y[jm]; + int delta = + shbuf[jm] + + __float2int_rn(sqrtf((pxi0 - pxj0) * (pxi0 - pxj0) + (pyi0 - pyj0) * (pyi0 - pyj0))) + + __float2int_rn(sqrtf((pxi1 - pxj1) * (pxi1 - pxj1) + (pyi1 - pyj1) * (pyi1 - pyj1))); + pxj1 = pxj0; + pyj1 = pyj0; + + if (delta < minchange[0]) { + minchange[0] = delta; + mini[0] = i; + minj[0] = j; + } + } + __syncthreads(); + } + + if (i < nodes - 2) { minchange[0] += buf[i]; } + } +} + +__global__ __launch_bounds__(2048, 2) void search_solution(TSPResults results, + int* mylock, + int const* vtx_ptr, + bool beam_search, + int const K, + int nodes, + float const* posx, + float const* posy, + int* work, + int const nstart) +{ + int* buf = &work[blockIdx.x * ((4 * nodes + 3 + 31) / 32 * 32)]; + float* px = (float*)(&buf[nodes]); + float* py = &px[nodes + 1]; + int* path = (int*)(&py[nodes + 1]); + + __shared__ int shbuf[tilesize]; + __shared__ int best_change[kswaps]; + __shared__ int best_i[kswaps]; + __shared__ int best_j[kswaps]; + + int minchange; + int mini; + int minj; + int kswaps_active = kswaps; + int myswaps = 0; + + // Hill climbing, iteratively improve from the starting guess + do { + if (threadIdx.x == 0) { + for (int k = 0; k < kswaps; k++) { + best_change[k] = 0; + best_i[k] = 0; + best_j[k] = 0; + } + } + __syncthreads(); + for (int i = threadIdx.x; i < nodes; i += blockDim.x) { + buf[i] = -__float2int_rn(euclidean_dist(px, py, i, i + 1)); + } + __syncthreads(); + + // Reset + minchange = 0; + mini = 0; + minj = 0; + + // Find best indices + two_opt_search(buf, px, py, shbuf, &minchange, &mini, &minj, nodes); + __syncthreads(); + + shbuf[threadIdx.x] = minchange; + + int j = blockDim.x; // warp reduction to find best thread results + do { + int k = (j + 1) / 2; + if ((threadIdx.x + k) < j) { + shbuf[threadIdx.x] = min(shbuf[threadIdx.x + k], 
shbuf[threadIdx.x]); } + j = k; + __syncthreads(); + } while (j > 1); // thread winner for this k is in shbuf[0] + + if (threadIdx.x == 0) { + best_change[0] = shbuf[0]; // store best result in shared + } + __syncthreads(); + + if (minchange == shbuf[0]) { // My thread is as good as the winner + shbuf[1] = threadIdx.x; // store thread ID in shbuf[1] + } + __syncthreads(); + + if (threadIdx.x == shbuf[1]) { // move from thread local to shared + best_i[0] = mini; // shared best indices for compatibility checks + best_j[0] = minj; + } + __syncthreads(); + + // look for more compatible swaps + for (int kmin = 1; kmin < kswaps_active; kmin++) { + // disallow swaps that conflict with ones already picked + for (int kchk = kmin - 1; kchk >= 0; --kchk) { + if ((mini < (best_j[kchk] + 1)) && (minj > (best_i[kchk] - 1))) { + minchange = shbuf[threadIdx.x] = 0; + } + __syncthreads(); + } + shbuf[threadIdx.x] = minchange; + + j = blockDim.x; + do { + int k = (j + 1) / 2; + if ((threadIdx.x + k) < j) { + shbuf[threadIdx.x] = min(shbuf[threadIdx.x + k], shbuf[threadIdx.x]); + } + j = k; + __syncthreads(); + } while (j > 1); // thread winner for this k is in shbuf[0] + + if (threadIdx.x == 0) { + best_change[kmin] = shbuf[0]; // store best result in shared + } + __syncthreads(); + + if (minchange == shbuf[0]) { // My thread is as good as the winner + shbuf[1] = threadIdx.x; // store thread ID in shbuf[1] + __threadfence_block(); + } + __syncthreads(); + + if (threadIdx.x == shbuf[1]) { // move from thread local to shared + best_i[kmin] = mini; // store swap targets + best_j[kmin] = minj; + __threadfence_block(); + } + __syncthreads(); + // look for the best compatible move + } // end loop over kmin + minchange = best_change[0]; + myswaps += 1; + for (int kmin = 0; kmin < kswaps_active; kmin++) { + int sum = best_i[kmin] + best_j[kmin] + 1; // = mini + minj +1 + // this is a reversal of all nodes included in the range [ i+1, j ] + for (int i = threadIdx.x; (i + i) < sum; i += blockDim.x) { + if (best_i[kmin] < i) { + int j = sum - i; + raft::swapVals(px[i], px[j]); + raft::swapVals(py[i], py[j]); + raft::swapVals(path[i], path[j]); + } + } + __syncthreads(); + } + } while (minchange < 0 && myswaps < 2 * nodes); +} + +__global__ void get_optimal_tour(TSPResults results, int* mylock, int* work, int const nodes) +{ + extern __shared__ int accumulator[]; + int climber_id = blockIdx.x; + int* buf = &work[climber_id * ((4 * nodes + 3 + 31) / 32 * 32)]; + float* px = (float*)(&buf[nodes]); + float* py = &px[nodes + 1]; + int* path = (int*)(&py[nodes + 1]); + + // Now find actual length of the last tour, result of the climb + int term = 0; + for (int i = threadIdx.x; i < nodes; i += blockDim.x) { + term += __float2int_rn(euclidean_dist(px, py, i, i + 1)); + } + accumulator[threadIdx.x] = term; + __syncthreads(); + + int j = blockDim.x; // block level reduction + do { + int k = (j + 1) / 2; + if ((threadIdx.x + k) < j) { accumulator[threadIdx.x] += accumulator[threadIdx.x + k]; } + j = k; // halve the number of active threads + __syncthreads(); + } while (j > 1); + term = accumulator[0]; + + if (threadIdx.x == 0) { + atomicMin(results.best_cost, term); + while (atomicExch(mylock, 1) != 0) + ; // acquire + if (results.best_cost[0] == term) { + results.best_x_pos[0] = px; + results.best_y_pos[0] = py; + results.best_route[0] = path; + } + *mylock = 0; // release + __threadfence(); + } +} + +} // namespace detail +} // namespace cugraph diff --git a/cpp/src/traversal/tsp_utils.hpp b/cpp/src/traversal/tsp_utils.hpp new 
file mode 100644
index 00000000000..eab5c09eb2f
--- /dev/null
+++ b/cpp/src/traversal/tsp_utils.hpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2014-2020, Texas State University. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define tilesize 128
+#define kswaps 4
+
+#include
+#include
+#include
+
+namespace cugraph {
+namespace detail {
+
+constexpr float euclidean_dist(float* px, float* py, int a, int b)
+{
+  return sqrtf((px[a] - px[b]) * (px[a] - px[b]) + (py[a] - py[b]) * (py[a] - py[b]));
+}
+
+// Get the maximum number of threads we can run, based on the number of nodes,
+// shared memory usage, max threads per block and per SM, max blocks per SM, and registers per SM.
+int best_thread_count(int nodes, int max_threads, int sm_count, int warp_size)
+{
+  int smem, blocks, thr, perf;
+  int const max_threads_sm = 2048;
+  int max                  = nodes - 2;
+  int best                 = 0;
+  int bthr                 = 4;
+
+  if (max > max_threads) max = max_threads;
+
+  for (int threads = 1; threads <= max; ++threads) {
+    smem   = sizeof(int) * threads + 2 * sizeof(float) * tilesize + sizeof(int) * tilesize;
+    blocks = (16384 * 2) / smem;
+    if (blocks > sm_count) blocks = sm_count;
+    thr = (threads + warp_size - 1) / warp_size * warp_size;
+    while (blocks * thr > max_threads_sm)
+      blocks--;
+    perf = threads * blocks;
+    if (perf > best) {
+      best = perf;
+      bthr = threads;
+    }
+  }
+
+  return bthr;
+}
+}  // namespace detail
+}  // namespace cugraph
diff --git a/cpp/src/traversal/two_hop_neighbors.cu
index fb984dae0ad..e1fce911130 100644
--- a/cpp/src/traversal/two_hop_neighbors.cu
+++ b/cpp/src/traversal/two_hop_neighbors.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,9 +20,9 @@ * ---------------------------------------------------------------------------**/ #include -#include -#include -#include +#include +#include +#include #include "two_hop_neighbors.cuh" #include @@ -32,12 +32,13 @@ namespace cugraph { template -std::unique_ptr> get_two_hop_neighbors(GraphCSRView const &graph) +std::unique_ptr> get_two_hop_neighbors( + legacy::GraphCSRView const& graph) { cudaStream_t stream{nullptr}; rmm::device_vector exsum_degree(graph.number_of_edges + 1); - ET *d_exsum_degree = exsum_degree.data().get(); + ET* d_exsum_degree = exsum_degree.data().get(); // Find the degree of the out vertex of each edge degree_iterator deg_it(graph.offsets); @@ -62,14 +63,14 @@ std::unique_ptr> get_two_hop_neighbors(GraphCSRView first_pair(output_size); rmm::device_vector second_pair(output_size); - VT *d_first_pair = first_pair.data().get(); - VT *d_second_pair = second_pair.data().get(); + VT* d_first_pair = first_pair.data().get(); + VT* d_second_pair = second_pair.data().get(); // Figure out number of blocks and allocate memory for block bucket offsets ET num_blocks = (output_size + TWO_HOP_BLOCK_SIZE - 1) / TWO_HOP_BLOCK_SIZE; rmm::device_vector block_bucket_offsets(num_blocks + 1); - ET *d_block_bucket_offsets = block_bucket_offsets.data().get(); + ET* d_block_bucket_offsets = block_bucket_offsets.data().get(); // Compute the block bucket offsets dim3 grid, block; @@ -108,7 +109,8 @@ std::unique_ptr> get_two_hop_neighbors(GraphCSRView>(graph.number_of_vertices, outputSize, false); + auto result = + std::make_unique>(graph.number_of_vertices, outputSize, false); cudaMemcpy(result->src_indices(), d_first_pair, sizeof(VT) * outputSize, cudaMemcpyDefault); cudaMemcpy(result->dst_indices(), d_second_pair, sizeof(VT) * outputSize, cudaMemcpyDefault); @@ -116,10 +118,10 @@ std::unique_ptr> get_two_hop_neighbors(GraphCSRView> get_two_hop_neighbors( - GraphCSRView const &); +template std::unique_ptr> get_two_hop_neighbors( + legacy::GraphCSRView const&); -template std::unique_ptr> get_two_hop_neighbors( - GraphCSRView const &); +template std::unique_ptr> get_two_hop_neighbors( + legacy::GraphCSRView const&); } // namespace cugraph diff --git a/cpp/src/traversal/two_hop_neighbors.cuh b/cpp/src/traversal/two_hop_neighbors.cuh index 87d3b36b861..e830fb4a95f 100644 --- a/cpp/src/traversal/two_hop_neighbors.cuh +++ b/cpp/src/traversal/two_hop_neighbors.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,13 +21,13 @@ #include -#define MAXBLOCKS 65535 +#define MAXBLOCKS 65535 #define TWO_HOP_BLOCK_SIZE 512 template struct degree_iterator { - edge_t const *offsets; - degree_iterator(edge_t const *_offsets) : offsets(_offsets) {} + edge_t const* offsets; + degree_iterator(edge_t const* _offsets) : offsets(_offsets) {} __host__ __device__ edge_t operator[](edge_t place) { @@ -53,7 +53,7 @@ struct self_loop_flagger { }; template -__device__ edge_t binsearch_maxle(const edge_t *vec, const edge_t val, edge_t low, edge_t high) +__device__ edge_t binsearch_maxle(const edge_t* vec, const edge_t val, edge_t low, edge_t high) { while (true) { if (low == high) return low; // we know it exists @@ -69,8 +69,8 @@ __device__ edge_t binsearch_maxle(const edge_t *vec, const edge_t val, edge_t lo } template -__global__ void compute_bucket_offsets_kernel(const edge_t *frontier_degrees_exclusive_sum, - edge_t *bucket_offsets, +__global__ void compute_bucket_offsets_kernel(const edge_t* frontier_degrees_exclusive_sum, + edge_t* bucket_offsets, const edge_t frontier_size, edge_t total_degree) { @@ -86,15 +86,15 @@ __global__ void compute_bucket_offsets_kernel(const edge_t *frontier_degrees_exc } template -__global__ void scatter_expand_kernel(const edge_t *exsum_degree, - const vertex_t *indices, - const edge_t *offsets, - const edge_t *bucket_offsets, +__global__ void scatter_expand_kernel(const edge_t* exsum_degree, + const vertex_t* indices, + const edge_t* offsets, + const edge_t* bucket_offsets, vertex_t num_verts, edge_t max_item, edge_t max_block, - vertex_t *output_first, - vertex_t *output_second) + vertex_t* output_first, + vertex_t* output_second) { __shared__ edge_t blockRange[2]; for (edge_t bid = blockIdx.x; bid < max_block; bid += gridDim.x) { diff --git a/cpp/src/tree/mst.cu b/cpp/src/tree/mst.cu index cc3bdc64a2d..e6caa629cd1 100644 --- a/cpp/src/tree/mst.cu +++ b/cpp/src/tree/mst.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,7 +20,7 @@ * @file mst.cu * ---------------------------------------------------------------------------**/ -#include +#include #include #include @@ -28,8 +28,8 @@ #include #include -#include -#include +#include +#include #include @@ -38,10 +38,10 @@ namespace cugraph { namespace detail { template -std::unique_ptr> mst_impl( - raft::handle_t const &handle, - GraphCSRView const &graph, - rmm::mr::device_memory_resource *mr) +std::unique_ptr> mst_impl( + raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + rmm::mr::device_memory_resource* mr) { auto stream = handle.get_stream(); @@ -55,33 +55,33 @@ std::unique_ptr> mst_impl( colors.data(), stream); - GraphCOOContents coo_contents{ + legacy::GraphCOOContents coo_contents{ graph.number_of_vertices, mst_edges.n_edges, std::make_unique(mst_edges.src.release()), std::make_unique(mst_edges.dst.release()), std::make_unique(mst_edges.weights.release())}; - return std::make_unique>(std::move(coo_contents)); + return std::make_unique>(std::move(coo_contents)); } } // namespace detail template -std::unique_ptr> minimum_spanning_tree( - raft::handle_t const &handle, - GraphCSRView const &graph, - rmm::mr::device_memory_resource *mr) +std::unique_ptr> minimum_spanning_tree( + raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + rmm::mr::device_memory_resource* mr) { return detail::mst_impl(handle, graph, mr); } -template std::unique_ptr> minimum_spanning_tree( - raft::handle_t const &handle, - GraphCSRView const &graph, - rmm::mr::device_memory_resource *mr); -template std::unique_ptr> minimum_spanning_tree( - raft::handle_t const &handle, - GraphCSRView const &graph, - rmm::mr::device_memory_resource *mr); +template std::unique_ptr> minimum_spanning_tree( + raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + rmm::mr::device_memory_resource* mr); +template std::unique_ptr> +minimum_spanning_tree(raft::handle_t const& handle, + legacy::GraphCSRView const& graph, + rmm::mr::device_memory_resource* mr); } // namespace cugraph diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index 6c8ef98e2e2..909e3d5b31f 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,24 +14,112 @@ * limitations under the License. */ -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include -#include -#include #include +#include + #include +#include +#include #include +#include +#include +#include +#include + +#include +#include namespace cugraph { namespace cython { namespace detail { -// FIXME: Add description of this function +// workaround for CUDA extended lambda restrictions +template +struct compute_local_partition_id_t { + vertex_t const* lasts{nullptr}; + size_t num_local_partitions{0}; + + __device__ size_t operator()(vertex_t v) + { + for (size_t i = 0; i < num_local_partitions; ++i) { + if (v < lasts[i]) { return i; } + } + return num_local_partitions; + } +}; + +// FIXME: this is unnecessary if edge_counts_ in the major_minor_weights_t object returned by +// call_shuffle() is passed back, better be fixed. this code assumes that the entire set of edges +// for each partition are consecutively stored. 
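+// A hypothetical illustration (values invented for clarity, not part of the
+// build): compute_edge_counts() below counts, per local partition, how many
+// major vertices fall in each [lasts[i-1], lasts[i]) range using the functor
+// above. With lasts = {4, 8, 12} on the device and num_local_partitions = 3:
+//
+//   compute_local_partition_id_t<int32_t> op{d_lasts.data(), 3};
+//   op(0);   // -> 0  (0 < 4)
+//   op(5);   // -> 1  (4 <= 5 < 8)
+//   op(11);  // -> 2  (8 <= 11 < 12)
+//   op(12);  // -> 3  (== num_local_partitions, i.e. out of range)
+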
+template +std::vector compute_edge_counts(raft::handle_t const& handle, + graph_container_t const& graph_container) +{ + auto num_local_partitions = static_cast(graph_container.col_comm_size); + + std::vector partition_offsets_vector( + reinterpret_cast(graph_container.vertex_partition_offsets), + reinterpret_cast(graph_container.vertex_partition_offsets) + + (graph_container.row_comm_size * graph_container.col_comm_size) + 1); + + std::vector h_lasts(num_local_partitions); + for (size_t i = 0; i < h_lasts.size(); ++i) { + h_lasts[i] = partition_offsets_vector[graph_container.row_comm_size * (i + 1)]; + } + rmm::device_uvector d_lasts(h_lasts.size(), handle.get_stream()); + raft::update_device(d_lasts.data(), h_lasts.data(), h_lasts.size(), handle.get_stream()); + auto major_vertices = transposed + ? reinterpret_cast(graph_container.dst_vertices) + : reinterpret_cast(graph_container.src_vertices); + auto key_first = thrust::make_transform_iterator( + major_vertices, compute_local_partition_id_t{d_lasts.data(), num_local_partitions}); + rmm::device_uvector d_local_partition_ids(num_local_partitions, handle.get_stream()); + rmm::device_uvector d_edge_counts(d_local_partition_ids.size(), handle.get_stream()); + auto it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + key_first, + key_first + graph_container.num_local_edges, + thrust::make_constant_iterator(edge_t{1}), + d_local_partition_ids.begin(), + d_edge_counts.begin()); + if (static_cast(thrust::distance(d_local_partition_ids.begin(), thrust::get<0>(it))) < + num_local_partitions) { + rmm::device_uvector d_counts(num_local_partitions, handle.get_stream()); + thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_counts.begin(), + d_counts.end(), + edge_t{0}); + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_edge_counts.begin(), + thrust::get<1>(it), + d_local_partition_ids.begin(), + d_counts.begin()); + d_edge_counts = std::move(d_counts); + } + std::vector h_edge_counts(num_local_partitions, 0); + raft::update_host( + h_edge_counts.data(), d_edge_counts.data(), d_edge_counts.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + + return h_edge_counts; +} + template > create_graph(raft::handle_t const& handle, graph_container_t const& graph_container) { - std::vector> edgelist( - {{reinterpret_cast(graph_container.src_vertices), - reinterpret_cast(graph_container.dst_vertices), - reinterpret_cast(graph_container.weights), - static_cast(graph_container.num_partition_edges)}}); + auto num_local_partitions = static_cast(graph_container.col_comm_size); std::vector partition_offsets_vector( reinterpret_cast(graph_container.vertex_partition_offsets), reinterpret_cast(graph_container.vertex_partition_offsets) + (graph_container.row_comm_size * graph_container.col_comm_size) + 1); + auto edge_counts = compute_edge_counts(handle, graph_container); + + std::vector displacements(edge_counts.size(), 0); + std::partial_sum(edge_counts.begin(), edge_counts.end() - 1, displacements.begin() + 1); + + std::vector> edgelists( + num_local_partitions); + for (size_t i = 0; i < edgelists.size(); ++i) { + edgelists[i] = cugraph::experimental::edgelist_t{ + reinterpret_cast(graph_container.src_vertices) + displacements[i], + reinterpret_cast(graph_container.dst_vertices) + displacements[i], + graph_container.is_weighted + ? 
std::optional( + {static_cast(graph_container.weights) + displacements[i]}) + : std::nullopt, + edge_counts[i]}; + } + experimental::partition_t partition(partition_offsets_vector, - graph_container.hypergraph_partitioned, graph_container.row_comm_size, graph_container.col_comm_size, graph_container.row_comm_rank, @@ -61,14 +162,17 @@ create_graph(raft::handle_t const& handle, graph_container_t const& graph_contai return std::make_unique>( handle, - edgelist, + edgelists, partition, static_cast(graph_container.num_global_vertices), static_cast(graph_container.num_global_edges), graph_container.graph_props, - // FIXME: This currently fails if sorted_by_degree is true... - // graph_container.sorted_by_degree, - false, + graph_container.segment_offsets != nullptr + ? std::make_optional>( + static_cast(graph_container.segment_offsets), + static_cast(graph_container.segment_offsets) + + graph_container.num_segments + 1) + : std::nullopt, graph_container.do_expensive_check); } @@ -84,15 +188,21 @@ create_graph(raft::handle_t const& handle, graph_container_t const& graph_contai experimental::edgelist_t edgelist{ reinterpret_cast(graph_container.src_vertices), reinterpret_cast(graph_container.dst_vertices), - reinterpret_cast(graph_container.weights), - static_cast(graph_container.num_partition_edges)}; - + graph_container.is_weighted + ? std::optional{reinterpret_cast(graph_container.weights)} + : std::nullopt, + static_cast(graph_container.num_local_edges)}; return std::make_unique>( handle, edgelist, static_cast(graph_container.num_global_vertices), graph_container.graph_props, - graph_container.sorted_by_degree, + graph_container.segment_offsets != nullptr + ? std::make_optional>( + static_cast(graph_container.segment_offsets), + static_cast(graph_container.segment_offsets) + + graph_container.num_segments + 1) + : std::nullopt, graph_container.do_expensive_check); } @@ -107,13 +217,16 @@ void populate_graph_container(graph_container_t& graph_container, void* dst_vertices, void* weights, void* vertex_partition_offsets, + void* segment_offsets, + size_t num_segments, numberTypeEnum vertexType, numberTypeEnum edgeType, numberTypeEnum weightType, - size_t num_partition_edges, + size_t num_local_edges, size_t num_global_vertices, size_t num_global_edges, - bool sorted_by_degree, + bool is_weighted, + bool is_symmetric, bool transposed, bool multi_gpu) { @@ -121,20 +234,28 @@ void populate_graph_container(graph_container_t& graph_container, "populate_graph_container() can only be called on an empty container."); bool do_expensive_check{true}; - bool hypergraph_partitioned{false}; - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); // pcols - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); // prows + if (multi_gpu) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); // pcols + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); // prows + graph_container.row_comm_size = row_comm_size; + graph_container.col_comm_size = col_comm_size; + 
graph_container.row_comm_rank = row_comm_rank; + graph_container.col_comm_rank = col_comm_rank; + } - graph_container.vertex_partition_offsets = vertex_partition_offsets; graph_container.src_vertices = src_vertices; graph_container.dst_vertices = dst_vertices; graph_container.weights = weights; - graph_container.num_partition_edges = num_partition_edges; + graph_container.is_weighted = is_weighted; + graph_container.vertex_partition_offsets = vertex_partition_offsets; + graph_container.segment_offsets = segment_offsets; + graph_container.num_segments = num_segments; + graph_container.num_local_edges = num_local_edges; graph_container.num_global_vertices = num_global_vertices; graph_container.num_global_edges = num_global_edges; graph_container.vertexType = vertexType; @@ -142,15 +263,10 @@ void populate_graph_container(graph_container_t& graph_container, graph_container.weightType = weightType; graph_container.transposed = transposed; graph_container.is_multi_gpu = multi_gpu; - graph_container.hypergraph_partitioned = hypergraph_partitioned; - graph_container.row_comm_size = row_comm_size; - graph_container.col_comm_size = col_comm_size; - graph_container.row_comm_rank = row_comm_rank; - graph_container.col_comm_rank = col_comm_rank; - graph_container.sorted_by_degree = sorted_by_degree; graph_container.do_expensive_check = do_expensive_check; - experimental::graph_properties_t graph_props{.is_symmetric = false, .is_multigraph = false}; + experimental::graph_properties_t graph_props{.is_symmetric = is_symmetric, + .is_multigraph = false}; graph_container.graph_props = graph_props; graph_container.graph_type = graphTypeEnum::graph_t; @@ -172,7 +288,7 @@ void populate_graph_container_legacy(graph_container_t& graph_container, int* local_offsets) { CUGRAPH_EXPECTS(graph_container.graph_type == graphTypeEnum::null, - "populate_graph_container() can only be called on an empty container."); + "populate_graph_container_legacy() can only be called on an empty container."); // FIXME: This is soon-to-be legacy code left in place until the new graph_t // class is supported everywhere else. 
Remove everything down to the comment @@ -182,11 +298,11 @@ void populate_graph_container_legacy(graph_container_t& graph_container, switch (legacyType) { case graphTypeEnum::LegacyCSR: { graph_container.graph_ptr_union.GraphCSRViewFloatPtr = - std::make_unique>(reinterpret_cast(offsets), - reinterpret_cast(indices), - reinterpret_cast(weights), - num_global_vertices, - num_global_edges); + std::make_unique>(reinterpret_cast(offsets), + reinterpret_cast(indices), + reinterpret_cast(weights), + num_global_vertices, + num_global_edges); graph_container.graph_type = graphTypeEnum::GraphCSRViewFloat; (graph_container.graph_ptr_union.GraphCSRViewFloatPtr) ->set_local_data(local_vertices, local_edges, local_offsets); @@ -195,11 +311,11 @@ void populate_graph_container_legacy(graph_container_t& graph_container, } break; case graphTypeEnum::LegacyCSC: { graph_container.graph_ptr_union.GraphCSCViewFloatPtr = - std::make_unique>(reinterpret_cast(offsets), - reinterpret_cast(indices), - reinterpret_cast(weights), - num_global_vertices, - num_global_edges); + std::make_unique>(reinterpret_cast(offsets), + reinterpret_cast(indices), + reinterpret_cast(weights), + num_global_vertices, + num_global_edges); graph_container.graph_type = graphTypeEnum::GraphCSCViewFloat; (graph_container.graph_ptr_union.GraphCSCViewFloatPtr) ->set_local_data(local_vertices, local_edges, local_offsets); @@ -208,11 +324,11 @@ void populate_graph_container_legacy(graph_container_t& graph_container, } break; case graphTypeEnum::LegacyCOO: { graph_container.graph_ptr_union.GraphCOOViewFloatPtr = - std::make_unique>(reinterpret_cast(offsets), - reinterpret_cast(indices), - reinterpret_cast(weights), - num_global_vertices, - num_global_edges); + std::make_unique>(reinterpret_cast(offsets), + reinterpret_cast(indices), + reinterpret_cast(weights), + num_global_vertices, + num_global_edges); graph_container.graph_type = graphTypeEnum::GraphCOOViewFloat; (graph_container.graph_ptr_union.GraphCOOViewFloatPtr) ->set_local_data(local_vertices, local_edges, local_offsets); @@ -226,11 +342,12 @@ void populate_graph_container_legacy(graph_container_t& graph_container, switch (legacyType) { case graphTypeEnum::LegacyCSR: { graph_container.graph_ptr_union.GraphCSRViewDoublePtr = - std::make_unique>(reinterpret_cast(offsets), - reinterpret_cast(indices), - reinterpret_cast(weights), - num_global_vertices, - num_global_edges); + std::make_unique>( + reinterpret_cast(offsets), + reinterpret_cast(indices), + reinterpret_cast(weights), + num_global_vertices, + num_global_edges); graph_container.graph_type = graphTypeEnum::GraphCSRViewDouble; (graph_container.graph_ptr_union.GraphCSRViewDoublePtr) ->set_local_data(local_vertices, local_edges, local_offsets); @@ -239,11 +356,12 @@ void populate_graph_container_legacy(graph_container_t& graph_container, } break; case graphTypeEnum::LegacyCSC: { graph_container.graph_ptr_union.GraphCSCViewDoublePtr = - std::make_unique>(reinterpret_cast(offsets), - reinterpret_cast(indices), - reinterpret_cast(weights), - num_global_vertices, - num_global_edges); + std::make_unique>( + reinterpret_cast(offsets), + reinterpret_cast(indices), + reinterpret_cast(weights), + num_global_vertices, + num_global_edges); graph_container.graph_type = graphTypeEnum::GraphCSCViewDouble; (graph_container.graph_ptr_union.GraphCSCViewDoublePtr) ->set_local_data(local_vertices, local_edges, local_offsets); @@ -252,11 +370,12 @@ void populate_graph_container_legacy(graph_container_t& graph_container, } break; case 
graphTypeEnum::LegacyCOO: { graph_container.graph_ptr_union.GraphCOOViewDoublePtr = - std::make_unique>(reinterpret_cast(offsets), - reinterpret_cast(indices), - reinterpret_cast(weights), - num_global_vertices, - num_global_edges); + std::make_unique>( + reinterpret_cast(offsets), + reinterpret_cast(indices), + reinterpret_cast(weights), + num_global_vertices, + num_global_edges); graph_container.graph_type = graphTypeEnum::GraphCOOViewDouble; (graph_container.graph_ptr_union.GraphCOOViewDoublePtr) ->set_local_data(local_vertices, local_edges, local_offsets); @@ -463,65 +582,96 @@ void call_pagerank(raft::handle_t const& handle, int64_t max_iter, bool has_guess) { - if (graph_container.graph_type == graphTypeEnum::GraphCSCViewFloat) { - pagerank(handle, - *(graph_container.graph_ptr_union.GraphCSCViewFloatPtr), - reinterpret_cast(p_pagerank), - static_cast(personalization_subset_size), - reinterpret_cast(personalization_subset), - reinterpret_cast(personalization_values), - alpha, - tolerance, - max_iter, - has_guess); - graph_container.graph_ptr_union.GraphCSCViewFloatPtr->get_vertex_identifiers( - reinterpret_cast(identifiers)); - } else if (graph_container.graph_type == graphTypeEnum::GraphCSCViewDouble) { - pagerank(handle, - *(graph_container.graph_ptr_union.GraphCSCViewDoublePtr), - reinterpret_cast(p_pagerank), - static_cast(personalization_subset_size), - reinterpret_cast(personalization_subset), - reinterpret_cast(personalization_values), - alpha, - tolerance, - max_iter, - has_guess); - graph_container.graph_ptr_union.GraphCSCViewDoublePtr->get_vertex_identifiers( - reinterpret_cast(identifiers)); - } else if (graph_container.graph_type == graphTypeEnum::graph_t) { + if (graph_container.is_multi_gpu) { + auto& comm = handle.get_comms(); + auto aggregate_personalization_subset_size = cugraph::experimental::host_scalar_allreduce( + comm, personalization_subset_size, handle.get_stream()); + if (graph_container.edgeType == numberTypeEnum::int32Type) { auto graph = detail::create_graph(handle, graph_container); - cugraph::experimental::pagerank(handle, - graph->view(), - static_cast(nullptr), - reinterpret_cast(personalization_subset), - reinterpret_cast(personalization_values), - static_cast(personalization_subset_size), - reinterpret_cast(p_pagerank), - static_cast(alpha), - static_cast(tolerance), - max_iter, - has_guess, - false); + cugraph::experimental::pagerank( + handle, + graph->view(), + std::nullopt, + aggregate_personalization_subset_size > 0 + ? std::optional{reinterpret_cast(personalization_subset)} + : std::nullopt, + aggregate_personalization_subset_size > 0 + ? std::optional{personalization_values} + : std::nullopt, + aggregate_personalization_subset_size > 0 + ? 
std::optional{static_cast(personalization_subset_size)} + : std::nullopt, + reinterpret_cast(p_pagerank), + static_cast(alpha), + static_cast(tolerance), + max_iter, + has_guess, + true); } else if (graph_container.edgeType == numberTypeEnum::int64Type) { auto graph = detail::create_graph(handle, graph_container); - cugraph::experimental::pagerank(handle, - graph->view(), - static_cast(nullptr), - reinterpret_cast(personalization_subset), - reinterpret_cast(personalization_values), - static_cast(personalization_subset_size), - reinterpret_cast(p_pagerank), - static_cast(alpha), - static_cast(tolerance), - max_iter, - has_guess, - false); - } else { - CUGRAPH_FAIL("vertexType/edgeType combination unsupported"); + cugraph::experimental::pagerank( + handle, + graph->view(), + std::nullopt, + aggregate_personalization_subset_size > 0 + ? std::optional{personalization_subset} + : std::nullopt, + aggregate_personalization_subset_size > 0 + ? std::optional{personalization_values} + : std::nullopt, + aggregate_personalization_subset_size > 0 + ? std::optional{personalization_subset_size} + : std::nullopt, + reinterpret_cast(p_pagerank), + static_cast(alpha), + static_cast(tolerance), + max_iter, + has_guess, + true); + } + } else { + if (graph_container.edgeType == numberTypeEnum::int32Type) { + auto graph = + detail::create_graph(handle, graph_container); + cugraph::experimental::pagerank( + handle, + graph->view(), + std::nullopt, + personalization_subset_size > 0 + ? std::optional{reinterpret_cast(personalization_subset)} + : std::nullopt, + personalization_subset_size > 0 ? std::optional{personalization_values} + : std::nullopt, + personalization_subset_size > 0 ? std::optional{personalization_subset_size} + : std::nullopt, + reinterpret_cast(p_pagerank), + static_cast(alpha), + static_cast(tolerance), + max_iter, + has_guess, + true); + } else if (graph_container.edgeType == numberTypeEnum::int64Type) { + auto graph = + detail::create_graph(handle, graph_container); + cugraph::experimental::pagerank( + handle, + graph->view(), + std::nullopt, + personalization_subset_size > 0 ? std::optional{personalization_subset} + : std::nullopt, + personalization_subset_size > 0 ? std::optional{personalization_values} + : std::nullopt, + personalization_subset_size > 0 ? 
std::optional{personalization_subset_size} + : std::nullopt, + reinterpret_cast(p_pagerank), + static_cast(alpha), + static_cast(tolerance), + max_iter, + has_guess, + true); } } } @@ -591,31 +741,11 @@ void call_bfs(raft::handle_t const& handle, vertex_t* identifiers, vertex_t* distances, vertex_t* predecessors, - double* sp_counters, + vertex_t depth_limit, const vertex_t start_vertex, - bool directed) + bool direction_optimizing) { - if (graph_container.graph_type == graphTypeEnum::GraphCSRViewFloat) { - graph_container.graph_ptr_union.GraphCSRViewFloatPtr->get_vertex_identifiers( - reinterpret_cast(identifiers)); - bfs(handle, - *(graph_container.graph_ptr_union.GraphCSRViewFloatPtr), - reinterpret_cast(distances), - reinterpret_cast(predecessors), - sp_counters, - static_cast(start_vertex), - directed); - } else if (graph_container.graph_type == graphTypeEnum::GraphCSRViewDouble) { - graph_container.graph_ptr_union.GraphCSRViewDoublePtr->get_vertex_identifiers( - reinterpret_cast(identifiers)); - bfs(handle, - *(graph_container.graph_ptr_union.GraphCSRViewDoublePtr), - reinterpret_cast(distances), - reinterpret_cast(predecessors), - sp_counters, - static_cast(start_vertex), - directed); - } else if (graph_container.graph_type == graphTypeEnum::graph_t) { + if (graph_container.is_multi_gpu) { if (graph_container.edgeType == numberTypeEnum::int32Type) { auto graph = detail::create_graph(handle, graph_container); @@ -623,7 +753,9 @@ void call_bfs(raft::handle_t const& handle, graph->view(), reinterpret_cast(distances), reinterpret_cast(predecessors), - static_cast(start_vertex)); + static_cast(start_vertex), + direction_optimizing, + static_cast(depth_limit)); } else if (graph_container.edgeType == numberTypeEnum::int64Type) { auto graph = detail::create_graph(handle, graph_container); @@ -631,12 +763,253 @@ void call_bfs(raft::handle_t const& handle, graph->view(), reinterpret_cast(distances), reinterpret_cast(predecessors), - static_cast(start_vertex)); - } else { - CUGRAPH_FAIL("vertexType/edgeType combination unsupported"); + static_cast(start_vertex), + direction_optimizing, + static_cast(depth_limit)); } + } else { + if (graph_container.edgeType == numberTypeEnum::int32Type) { + auto graph = + detail::create_graph(handle, graph_container); + cugraph::experimental::bfs(handle, + graph->view(), + reinterpret_cast(distances), + reinterpret_cast(predecessors), + static_cast(start_vertex), + direction_optimizing, + static_cast(depth_limit)); + } else if (graph_container.edgeType == numberTypeEnum::int64Type) { + auto graph = + detail::create_graph(handle, graph_container); + cugraph::experimental::bfs(handle, + graph->view(), + reinterpret_cast(distances), + reinterpret_cast(predecessors), + static_cast(start_vertex), + direction_optimizing, + static_cast(depth_limit)); + } + } +} + +// Wrapper for calling extract_egonet through a graph container +// FIXME : this should not be a legacy COO and it is not clear how to handle C++ api return type as +// is.graph_container Need to figure out how to return edge lists +template +std::unique_ptr call_egonet(raft::handle_t const& handle, + graph_container_t const& graph_container, + vertex_t* source_vertex, + vertex_t n_subgraphs, + vertex_t radius) +{ + if (graph_container.edgeType == numberTypeEnum::int32Type) { + auto graph = + detail::create_graph(handle, graph_container); + auto g = cugraph::experimental::extract_ego(handle, + graph->view(), + reinterpret_cast(source_vertex), + static_cast(n_subgraphs), + static_cast(radius)); + 
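+    // extract_ego() hands back a (src, dst, optional weights, subgraph offsets)
+    // tuple; the code below repackages it as a cy_multi_edgelists_t for Cython,
+    // substituting an empty device_buffer for the weights when the graph is
+    // unweighted.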
cy_multi_edgelists_t coo_contents{ + 0, // not used + std::get<0>(g).size(), + static_cast(n_subgraphs), + std::make_unique(std::get<0>(g).release()), + std::make_unique(std::get<1>(g).release()), + std::make_unique( + std::get<2>(g) ? (*std::get<2>(g)).release() + : rmm::device_buffer(size_t{0}, handle.get_stream_view())), + std::make_unique(std::get<3>(g).release())}; + return std::make_unique(std::move(coo_contents)); + } else if (graph_container.edgeType == numberTypeEnum::int64Type) { + auto graph = + detail::create_graph(handle, graph_container); + auto g = cugraph::experimental::extract_ego(handle, + graph->view(), + reinterpret_cast(source_vertex), + static_cast(n_subgraphs), + static_cast(radius)); + cy_multi_edgelists_t coo_contents{ + 0, // not used + std::get<0>(g).size(), + static_cast(n_subgraphs), + std::make_unique(std::get<0>(g).release()), + std::make_unique(std::get<1>(g).release()), + std::make_unique( + std::get<2>(g) ? (*std::get<2>(g)).release() + : rmm::device_buffer(size_t{0}, handle.get_stream_view())), + std::make_unique(std::get<3>(g).release())}; + return std::make_unique(std::move(coo_contents)); + } else { + CUGRAPH_FAIL("vertexType/edgeType combination unsupported"); } } +// Wrapper for graph generate_rmat_edgelist() +// to expose the API to cython +// enum class generator_distribution_t { POWER_LAW = 0, UNIFORM }; +template +std::unique_ptr call_generate_rmat_edgelist(raft::handle_t const& handle, + size_t scale, + size_t num_edges, + double a, + double b, + double c, + uint64_t seed, + bool clip_and_flip, + bool scramble_vertex_ids) +{ + auto src_dst_tuple = cugraph::generate_rmat_edgelist( + handle, scale, num_edges, a, b, c, seed, clip_and_flip); + + if (scramble_vertex_ids) { + cugraph::scramble_vertex_ids( + handle, std::get<0>(src_dst_tuple), std::get<1>(src_dst_tuple), vertex_t{0}, seed); + } + + graph_generator_t gg_vals{ + std::make_unique(std::get<0>(src_dst_tuple).release()), + std::make_unique(std::get<1>(src_dst_tuple).release())}; + + return std::make_unique(std::move(gg_vals)); +} + +template +std::vector, std::unique_ptr>> +call_generate_rmat_edgelists(raft::handle_t const& handle, + size_t n_edgelists, + size_t min_scale, + size_t max_scale, + size_t edge_factor, + cugraph::generator_distribution_t size_distribution, + cugraph::generator_distribution_t edge_distribution, + uint64_t seed, + bool clip_and_flip, + bool scramble_vertex_ids) +{ + auto src_dst_vec_tuple = cugraph::generate_rmat_edgelists(handle, + n_edgelists, + min_scale, + max_scale, + edge_factor, + size_distribution, + edge_distribution, + seed, + clip_and_flip); + + if (scramble_vertex_ids) { + std::for_each( + src_dst_vec_tuple.begin(), src_dst_vec_tuple.end(), [&handle, seed](auto& src_dst_tuple) { + cugraph::scramble_vertex_ids( + handle, std::get<0>(src_dst_tuple), std::get<1>(src_dst_tuple), vertex_t{0}, seed); + }); + } + + std::vector, std::unique_ptr>> + gg_vec; + + std::transform( + src_dst_vec_tuple.begin(), + src_dst_vec_tuple.end(), + std::back_inserter(gg_vec), + [](auto& tpl_dev_uvec) { + return std::make_pair( + std::move(std::make_unique(std::get<0>(tpl_dev_uvec).release())), + std::move(std::make_unique(std::get<1>(tpl_dev_uvec).release()))); + }); + + return gg_vec; +} + +// Wrapper for random_walks() through a graph container +// to expose the API to cython. 
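+// A sketch of the coalesced layout returned below (path contents and sizes are
+// hypothetical): with num_paths = 2 and max_depth = 3, the walks may stop
+// early, e.g. paths {v0, v1, v2} and {v3, v4}; the coalesced vertex buffer is
+// then {v0, v1, v2, v3, v4} with per-path sizes {3, 2}. With use_padding =
+// true, every path instead occupies exactly max_depth slots, so the sizes are
+// not needed to delimit paths.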
+// +template +std::enable_if_t::value, + std::unique_ptr> +call_random_walks(raft::handle_t const& handle, + graph_container_t const& graph_container, + vertex_t const* ptr_start_set, + edge_t num_paths, + edge_t max_depth, + bool use_padding) +{ + if (graph_container.weightType == numberTypeEnum::floatType) { + using weight_t = float; + + auto graph = + detail::create_graph(handle, graph_container); + + auto triplet = cugraph::experimental::random_walks( + handle, graph->view(), ptr_start_set, num_paths, max_depth, use_padding); + + random_walk_ret_t rw_tri{std::get<0>(triplet).size(), + std::get<1>(triplet).size(), + static_cast(num_paths), + static_cast(max_depth), + std::make_unique(std::get<0>(triplet).release()), + std::make_unique(std::get<1>(triplet).release()), + std::make_unique(std::get<2>(triplet).release())}; + + return std::make_unique(std::move(rw_tri)); + + } else if (graph_container.weightType == numberTypeEnum::doubleType) { + using weight_t = double; + + auto graph = + detail::create_graph(handle, graph_container); + + auto triplet = cugraph::experimental::random_walks( + handle, graph->view(), ptr_start_set, num_paths, max_depth, use_padding); + + random_walk_ret_t rw_tri{std::get<0>(triplet).size(), + std::get<1>(triplet).size(), + static_cast(num_paths), + static_cast(max_depth), + std::make_unique(std::get<0>(triplet).release()), + std::make_unique(std::get<1>(triplet).release()), + std::make_unique(std::get<2>(triplet).release())}; + + return std::make_unique(std::move(rw_tri)); + + } else { + CUGRAPH_FAIL("Unsupported weight type."); + } +} + +template +std::unique_ptr call_rw_paths(raft::handle_t const& handle, + index_t num_paths, + index_t const* vertex_path_sizes) +{ + auto triplet = + cugraph::experimental::query_rw_sizes_offsets(handle, num_paths, vertex_path_sizes); + random_walk_path_t rw_path_tri{ + std::make_unique(std::get<0>(triplet).release()), + std::make_unique(std::get<1>(triplet).release()), + std::make_unique(std::get<2>(triplet).release())}; + return std::make_unique(std::move(rw_path_tri)); +} + +template +std::unique_ptr random_walks_to_coo(raft::handle_t const& handle, + random_walk_ret_t& rw_tri) +{ + auto triplet = cugraph::experimental::convert_paths_to_coo( + handle, + static_cast(rw_tri.coalesced_sz_v_), + static_cast(rw_tri.num_paths_), + std::move(*rw_tri.d_coalesced_v_), + std::move(*rw_tri.d_sizes_)); + + random_walk_coo_t rw_coo{std::get<0>(triplet).size(), + std::get<2>(triplet).size(), + std::make_unique(std::get<0>(triplet).release()), + std::make_unique(std::get<1>(triplet).release()), + std::move(rw_tri.d_coalesced_w_), // pass-through + std::make_unique(std::get<2>(triplet).release())}; + + return std::make_unique(std::move(rw_coo)); +} // Wrapper for calling SSSP through a graph container template @@ -686,6 +1059,187 @@ void call_sssp(raft::handle_t const& handle, } } +// wrapper for weakly connected components: +// +template +void call_wcc(raft::handle_t const& handle, + graph_container_t const& graph_container, + vertex_t* components) +{ + if (graph_container.is_multi_gpu) { + if (graph_container.edgeType == numberTypeEnum::int32Type) { + auto graph = + detail::create_graph(handle, graph_container); + cugraph::experimental::weakly_connected_components( + handle, graph->view(), reinterpret_cast(components), false); + + } else if (graph_container.edgeType == numberTypeEnum::int64Type) { + auto graph = + detail::create_graph(handle, graph_container); + cugraph::experimental::weakly_connected_components( + handle, 
graph->view(), reinterpret_cast(components), false); + } + } else { + if (graph_container.edgeType == numberTypeEnum::int32Type) { + auto graph = + detail::create_graph(handle, graph_container); + cugraph::experimental::weakly_connected_components( + handle, graph->view(), reinterpret_cast(components), false); + } else if (graph_container.edgeType == numberTypeEnum::int64Type) { + auto graph = + detail::create_graph(handle, graph_container); + cugraph::experimental::weakly_connected_components( + handle, graph->view(), reinterpret_cast(components), false); + } + } +} + +// wrapper for shuffling: +// +template +std::unique_ptr> call_shuffle( + raft::handle_t const& handle, + vertex_t* + edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place + vertex_t* edgelist_minor_vertices, // [IN / OUT] + weight_t* edgelist_weights, // [IN / OUT] + edge_t num_edgelist_edges) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + std::unique_ptr> ptr_ret = + std::make_unique>(handle); + + if (edgelist_weights != nullptr) { + auto zip_edge = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights)); + + std::forward_as_tuple( + std::tie(ptr_ret->get_major(), ptr_ret->get_minor(), ptr_ret->get_weights()), + std::ignore) = + cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + zip_edge, + zip_edge + num_edgelist_edges, + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + } else { + auto zip_edge = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices)); + + std::forward_as_tuple(std::tie(ptr_ret->get_major(), ptr_ret->get_minor()), + std::ignore) = + cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + zip_edge, + zip_edge + num_edgelist_edges, + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + } + + auto local_partition_id_op = + [comm_size, + key_func = cugraph::experimental::detail::compute_partition_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto pair) { + return key_func(thrust::get<0>(pair), thrust::get<1>(pair)) / + comm_size; // global partition id to local partition id + }; + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(ptr_ret->get_major().data(), ptr_ret->get_minor().data())); + + auto edge_counts = + (edgelist_weights != nullptr) + ? 
cugraph::experimental::groupby_and_count(pair_first,
+                                               pair_first + ptr_ret->get_major().size(),
+                                               ptr_ret->get_weights().data(),
+                                               local_partition_id_op,
+                                               col_comm_size,
+                                               handle.get_stream())
+      : cugraph::experimental::groupby_and_count(pair_first,
+                                                 pair_first + ptr_ret->get_major().size(),
+                                                 local_partition_id_op,
+                                                 col_comm_size,
+                                                 handle.get_stream());
+
+  std::vector h_edge_counts(edge_counts.size());
+  raft::update_host(
+    h_edge_counts.data(), edge_counts.data(), edge_counts.size(), handle.get_stream());
+  handle.get_stream_view().synchronize();
+
+  ptr_ret->get_edge_counts().resize(h_edge_counts.size());
+  for (size_t i = 0; i < h_edge_counts.size(); ++i) {
+    ptr_ret->get_edge_counts()[i] = static_cast(h_edge_counts[i]);
+  }
+
+  return ptr_ret;  // RVO-ed
+}
+
+// Wrapper for calling renumber_edgelist() in place:
+// TODO: check if return type needs further handling...
+//
+template
+std::unique_ptr> call_renumber(
+  raft::handle_t const& handle,
+  vertex_t* shuffled_edgelist_major_vertices /* [INOUT] */,
+  vertex_t* shuffled_edgelist_minor_vertices /* [INOUT] */,
+  std::vector const& edge_counts,
+  bool do_expensive_check,
+  bool multi_gpu)  // because Cython cannot take non-type template params
+{
+  // caveat: return values have different types on the two branches below:
+  //
+  std::unique_ptr> p_ret =
+    std::make_unique>(handle);
+
+  if (multi_gpu) {
+    std::vector displacements(edge_counts.size(), edge_t{0});
+    std::partial_sum(edge_counts.begin(), edge_counts.end() - 1, displacements.begin() + 1);
+    std::vector major_ptrs(edge_counts.size());
+    std::vector minor_ptrs(major_ptrs.size());
+    for (size_t i = 0; i < edge_counts.size(); ++i) {
+      major_ptrs[i] = shuffled_edgelist_major_vertices + displacements[i];
+      minor_ptrs[i] = shuffled_edgelist_minor_vertices + displacements[i];
+    }
+
+    std::tie(p_ret->get_dv(),
+             p_ret->get_partition(),
+             p_ret->get_num_vertices(),
+             p_ret->get_num_edges(),
+             p_ret->get_segment_offsets()) =
+      cugraph::experimental::renumber_edgelist(
+        handle, std::nullopt, major_ptrs, minor_ptrs, edge_counts, do_expensive_check);
+  } else {
+    std::tie(p_ret->get_dv(), p_ret->get_segment_offsets()) =
+      cugraph::experimental::renumber_edgelist(
+        handle,
+        std::nullopt,
+        shuffled_edgelist_major_vertices,
+        shuffled_edgelist_minor_vertices,
+        edge_counts[0],
+        do_expensive_check);
+
+    p_ret->get_partition() = cugraph::experimental::partition_t{};  // dummy
+
+    p_ret->get_num_vertices() = static_cast(p_ret->get_dv().size());
+    p_ret->get_num_edges()    = edge_counts[0];
+  }
+
+  return p_ret;  // RVO-ed (copy elision)
+}
+
 // Helper for setting up subcommunicators
 void init_subcomms(raft::handle_t& handle, size_t row_comm_size)
 {
@@ -806,36 +1360,103 @@ template void call_bfs(raft::handle_t const& handle,
                        int32_t* identifiers,
                        int32_t* distances,
                        int32_t* predecessors,
-                       double* sp_counters,
+                       int32_t depth_limit,
                        const int32_t start_vertex,
-                       bool directed);
+                       bool direction_optimizing);
 template void call_bfs(raft::handle_t const& handle,
                        graph_container_t const& graph_container,
                        int32_t* identifiers,
                        int32_t* distances,
                        int32_t* predecessors,
-                       double* sp_counters,
+                       int32_t depth_limit,
                        const int32_t start_vertex,
-                       bool directed);
+                       bool direction_optimizing);
 template void call_bfs(raft::handle_t const& handle,
                        graph_container_t const& graph_container,
                        int64_t* identifiers,
                        int64_t* distances,
                        int64_t* predecessors,
-                       double* sp_counters,
+                       int64_t depth_limit,
                        const int64_t start_vertex,
-                       bool directed);
+                       bool direction_optimizing);
 template void
call_bfs(raft::handle_t const& handle, graph_container_t const& graph_container, int64_t* identifiers, int64_t* distances, int64_t* predecessors, - double* sp_counters, + int64_t depth_limit, const int64_t start_vertex, - bool directed); + bool direction_optimizing); + +template std::unique_ptr call_egonet( + raft::handle_t const& handle, + graph_container_t const& graph_container, + int32_t* source_vertex, + int32_t n_subgraphs, + int32_t radius); + +template std::unique_ptr call_egonet( + raft::handle_t const& handle, + graph_container_t const& graph_container, + int32_t* source_vertex, + int32_t n_subgraphs, + int32_t radius); + +template std::unique_ptr call_egonet( + raft::handle_t const& handle, + graph_container_t const& graph_container, + int64_t* source_vertex, + int64_t n_subgraphs, + int64_t radius); + +template std::unique_ptr call_egonet( + raft::handle_t const& handle, + graph_container_t const& graph_container, + int64_t* source_vertex, + int64_t n_subgraphs, + int64_t radius); + +template std::unique_ptr call_random_walks( + raft::handle_t const& handle, + graph_container_t const& graph_container, + int32_t const* ptr_start_set, + int32_t num_paths, + int32_t max_depth, + bool use_padding); + +template std::unique_ptr call_random_walks( + raft::handle_t const& handle, + graph_container_t const& graph_container, + int32_t const* ptr_start_set, + int64_t num_paths, + int64_t max_depth, + bool use_padding); + +template std::unique_ptr call_random_walks( + raft::handle_t const& handle, + graph_container_t const& graph_container, + int64_t const* ptr_start_set, + int64_t num_paths, + int64_t max_depth, + bool use_padding); + +template std::unique_ptr call_rw_paths( + raft::handle_t const& handle, int32_t num_paths, int32_t const* vertex_path_sizes); + +template std::unique_ptr call_rw_paths( + raft::handle_t const& handle, int64_t num_paths, int64_t const* vertex_path_sizes); + +template std::unique_ptr random_walks_to_coo( + raft::handle_t const& handle, random_walk_ret_t& rw_tri); + +template std::unique_ptr random_walks_to_coo( + raft::handle_t const& handle, random_walk_ret_t& rw_tri); + +template std::unique_ptr random_walks_to_coo( + raft::handle_t const& handle, random_walk_ret_t& rw_tri); template void call_sssp(raft::handle_t const& handle, graph_container_t const& graph_container, @@ -865,5 +1486,137 @@ template void call_sssp(raft::handle_t const& handle, int64_t* predecessors, const int64_t source_vertex); +template void call_wcc(raft::handle_t const& handle, + graph_container_t const& graph_container, + int32_t* components); + +template void call_wcc(raft::handle_t const& handle, + graph_container_t const& graph_container, + int32_t* components); + +template void call_wcc(raft::handle_t const& handle, + graph_container_t const& graph_container, + int64_t* components); + +template void call_wcc(raft::handle_t const& handle, + graph_container_t const& graph_container, + int64_t* components); + +template std::unique_ptr> call_shuffle( + raft::handle_t const& handle, + int32_t* edgelist_major_vertices, + int32_t* edgelist_minor_vertices, + float* edgelist_weights, + int32_t num_edgelist_edges); + +template std::unique_ptr> call_shuffle( + raft::handle_t const& handle, + int32_t* edgelist_major_vertices, + int32_t* edgelist_minor_vertices, + float* edgelist_weights, + int64_t num_edgelist_edges); + +template std::unique_ptr> call_shuffle( + raft::handle_t const& handle, + int32_t* edgelist_major_vertices, + int32_t* edgelist_minor_vertices, + double* edgelist_weights, + 
int32_t num_edgelist_edges); + +template std::unique_ptr> call_shuffle( + raft::handle_t const& handle, + int32_t* edgelist_major_vertices, + int32_t* edgelist_minor_vertices, + double* edgelist_weights, + int64_t num_edgelist_edges); + +template std::unique_ptr> call_shuffle( + raft::handle_t const& handle, + int64_t* edgelist_major_vertices, + int64_t* edgelist_minor_vertices, + float* edgelist_weights, + int64_t num_edgelist_edges); + +template std::unique_ptr> call_shuffle( + raft::handle_t const& handle, + int64_t* edgelist_major_vertices, + int64_t* edgelist_minor_vertices, + double* edgelist_weights, + int64_t num_edgelist_edges); + +// TODO: add the remaining relevant EIDIr's: +// +template std::unique_ptr> call_renumber( + raft::handle_t const& handle, + int32_t* shuffled_edgelist_major_vertices /* [INOUT] */, + int32_t* shuffled_edgelist_minor_vertices /* [INOUT] */, + std::vector const& edge_counts, + bool do_expensive_check, + bool multi_gpu); + +template std::unique_ptr> call_renumber( + raft::handle_t const& handle, + int32_t* shuffled_edgelist_major_vertices /* [INOUT] */, + int32_t* shuffled_edgelist_minor_vertices /* [INOUT] */, + std::vector const& edge_counts, + bool do_expensive_check, + bool multi_gpu); + +template std::unique_ptr> call_renumber( + raft::handle_t const& handle, + int64_t* shuffled_edgelist_major_vertices /* [INOUT] */, + int64_t* shuffled_edgelist_minor_vertices /* [INOUT] */, + std::vector const& edge_counts, + bool do_expensive_check, + bool multi_gpu); + +template std::unique_ptr call_generate_rmat_edgelist( + raft::handle_t const& handle, + size_t scale, + size_t num_edges, + double a, + double b, + double c, + uint64_t seed, + bool clip_and_flip, + bool scramble_vertex_ids); + +template std::unique_ptr call_generate_rmat_edgelist( + raft::handle_t const& handle, + size_t scale, + size_t num_edges, + double a, + double b, + double c, + uint64_t seed, + bool clip_and_flip, + bool scramble_vertex_ids); + +template std::vector< + std::pair, std::unique_ptr>> +call_generate_rmat_edgelists(raft::handle_t const& handle, + size_t n_edgelists, + size_t min_scale, + size_t max_scale, + size_t edge_factor, + cugraph::generator_distribution_t size_distribution, + cugraph::generator_distribution_t edge_distribution, + uint64_t seed, + bool clip_and_flip, + bool scramble_vertex_ids); + +template std::vector< + std::pair, std::unique_ptr>> +call_generate_rmat_edgelists(raft::handle_t const& handle, + size_t n_edgelists, + size_t min_scale, + size_t max_scale, + size_t edge_factor, + cugraph::generator_distribution_t size_distribution, + cugraph::generator_distribution_t edge_distribution, + uint64_t seed, + bool clip_and_flip, + bool scramble_vertex_ids); + } // namespace cython } // namespace cugraph diff --git a/cpp/src/utilities/graph_bcast.cu b/cpp/src/utilities/graph_bcast.cu new file mode 100644 index 00000000000..e06c1508cf9 --- /dev/null +++ b/cpp/src/utilities/graph_bcast.cu @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Andrei Schaffer, aschaffer@nvidia.com +// +#include "graph_bcast.cuh" + +namespace cugraph { +namespace broadcast { +using namespace cugraph::experimental; +// Manual template instantiations (EIDir's): +// +template graph_t graph_broadcast( + raft::handle_t const& handle, graph_t* graph_ptr); + +template graph_t graph_broadcast( + raft::handle_t const& handle, graph_t* graph_ptr); + +template graph_t graph_broadcast( + raft::handle_t const& handle, graph_t* graph_ptr); + +template graph_t graph_broadcast( + raft::handle_t const& handle, graph_t* graph_ptr); + +template graph_t graph_broadcast( + raft::handle_t const& handle, graph_t* graph_ptr); + +template graph_t graph_broadcast( + raft::handle_t const& handle, graph_t* graph_ptr); + +} // namespace broadcast +} // namespace cugraph diff --git a/cpp/src/utilities/graph_bcast.cuh b/cpp/src/utilities/graph_bcast.cuh new file mode 100644 index 00000000000..b4007ad20f2 --- /dev/null +++ b/cpp/src/utilities/graph_bcast.cuh @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Andrei Schaffer, aschaffer@nvidia.com +// +#pragma once + +#include + +#include +#include + +#include + +namespace cugraph { +namespace broadcast { + +/** + * @brief broadcasts graph_t object (only the single GPU version). + * + * @tparam graph_t Type of graph (view). + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_ptr pointer to graph object: not `nullptr` on send, `nullptr` (ignored) on receive. 
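+ * @note A sketch of the protocol implemented below: rank 0 serializes the
+ * graph, broadcasts the (device, host) byte sizes via host_scalar_bcast(),
+ * then broadcasts the serialized bytes via device_bcast(); every other rank
+ * allocates a buffer of the advertised size and unserializes its own copy.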
+ * @return graph_t object that was sent/received + */ +template +graph_t graph_broadcast(raft::handle_t const& handle, graph_t* graph_ptr) +{ + using namespace cugraph::serializer; + using namespace cugraph::experimental; + + using vertex_t = typename graph_t::vertex_type; + using edge_t = typename graph_t::edge_type; + using weight_t = typename graph_t::weight_type; + + if constexpr (!graph_t::is_multi_gpu) { + if (handle.get_comms().get_rank() == 0) { + CUGRAPH_EXPECTS(graph_ptr != nullptr, "Cannot serialize nullptr graph pointer."); + + auto pair = serializer_t::get_device_graph_sz_bytes(*graph_ptr); + thrust::tuple dev_sz_host_sz_bytes = + thrust::make_tuple(pair.first, pair.second); + + auto total_graph_dev_sz = pair.first + pair.second; + + serializer_t ser(handle, total_graph_dev_sz); + serializer_t::graph_meta_t graph_meta{}; + ser.serialize(*graph_ptr, graph_meta); + + int root{0}; + host_scalar_bcast(handle.get_comms(), dev_sz_host_sz_bytes, root, handle.get_stream()); + device_bcast(handle.get_comms(), + ser.get_storage(), + ser.get_storage(), + total_graph_dev_sz, + root, + handle.get_stream()); + + return std::move(*graph_ptr); + } else { + thrust::tuple dev_sz_host_sz_bytes(0, 0); + + int root{0}; + dev_sz_host_sz_bytes = + host_scalar_bcast(handle.get_comms(), dev_sz_host_sz_bytes, root, handle.get_stream()); + // + auto total_graph_dev_sz = + thrust::get<0>(dev_sz_host_sz_bytes) + thrust::get<1>(dev_sz_host_sz_bytes); + + CUGRAPH_EXPECTS(total_graph_dev_sz > 0, "Graph size comm failure."); + + rmm::device_uvector data_buffer(total_graph_dev_sz, + handle.get_stream_view()); + + device_bcast(handle.get_comms(), + data_buffer.data(), + data_buffer.data(), + total_graph_dev_sz, + root, + handle.get_stream()); + + serializer_t ser(handle, data_buffer.data()); + auto graph = ser.unserialize(thrust::get<0>(dev_sz_host_sz_bytes), + thrust::get<1>(dev_sz_host_sz_bytes)); + + return graph; + } + } else { + CUGRAPH_FAIL("Unsupported graph type for broadcasting."); + + return graph_t{handle}; + } +} + +} // namespace broadcast +} // namespace cugraph diff --git a/cpp/src/utilities/graph_utils.cuh b/cpp/src/utilities/graph_utils.cuh index ca0b5831c92..4b7c2baab19 100644 --- a/cpp/src/utilities/graph_utils.cuh +++ b/cpp/src/utilities/graph_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. 
diff --git a/cpp/src/utilities/graph_utils.cuh b/cpp/src/utilities/graph_utils.cuh
index ca0b5831c92..4b7c2baab19 100644
--- a/cpp/src/utilities/graph_utils.cuh
+++ b/cpp/src/utilities/graph_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
@@ -13,11 +13,12 @@
 // Author: Alex Fender afender@nvidia.com
 #pragma once
-#include
+#include
 #include
-#include
 #include
+#include
+#include
 #include
 #include
@@ -31,12 +32,12 @@
 namespace cugraph {
 namespace detail {
 //#define DEBUG 1
-#define CUDA_MAX_BLOCKS 65535
+#define CUDA_MAX_BLOCKS 65535
 #define CUDA_MAX_KERNEL_THREADS 256  // kernel will launch at most 256 threads per block
 #define US
 template <typename count_t, typename index_t, typename value_t>
-__inline__ __device__ value_t parallel_prefix_sum(count_t n, index_t const *ind, value_t const *w)
+__inline__ __device__ value_t parallel_prefix_sum(count_t n, index_t const* ind, value_t const* w)
 {
   count_t i, j, mn;
   value_t v, last;
@@ -80,124 +81,110 @@ __inline__ __device__ value_t parallel_prefix_sum(count_t n, index_t const *ind,
   return last;
 }
-// dot
-template <typename T>
-T dot(size_t n, T *x, T *y)
-{
-  cudaStream_t stream{nullptr};
-  T result = thrust::inner_product(rmm::exec_policy(stream)->on(stream),
-                                   thrust::device_pointer_cast(x),
-                                   thrust::device_pointer_cast(x + n),
-                                   thrust::device_pointer_cast(y),
-                                   0.0f);
-  CHECK_CUDA(stream);
-  return result;
-}
-
 // axpy
 template <typename T>
 struct axpy_functor : public thrust::binary_function<T, T, T> {
   const T a;
   axpy_functor(T _a) : a(_a) {}
-  __host__ __device__ T operator()(const T &x, const T &y) const { return a * x + y; }
+  __host__ __device__ T operator()(const T& x, const T& y) const { return a * x + y; }
 };
 template <typename T>
-void axpy(size_t n, T a, T *x, T *y)
+void axpy(size_t n, T a, T* x, T* y)
 {
-  cudaStream_t stream{nullptr};
-  thrust::transform(rmm::exec_policy(stream)->on(stream),
+  rmm::cuda_stream_view stream_view;
+  thrust::transform(rmm::exec_policy(stream_view),
                     thrust::device_pointer_cast(x),
                     thrust::device_pointer_cast(x + n),
                     thrust::device_pointer_cast(y),
                     thrust::device_pointer_cast(y),
                     axpy_functor<T>(a));
-  CHECK_CUDA(stream);
+  CHECK_CUDA(stream_view.value());
 }
 // norm
 template <typename T>
 struct square {
-  __host__ __device__ T operator()(const T &x) const { return x * x; }
+  __host__ __device__ T operator()(const T& x) const { return x * x; }
 };
 template <typename T>
-T nrm2(size_t n, T *x)
+T nrm2(size_t n, T* x)
 {
-  cudaStream_t stream{nullptr};
+  rmm::cuda_stream_view stream_view;
   T init = 0;
-  T result = std::sqrt(thrust::transform_reduce(rmm::exec_policy(stream)->on(stream),
+  T result = std::sqrt(thrust::transform_reduce(rmm::exec_policy(stream_view),
                                                 thrust::device_pointer_cast(x),
                                                 thrust::device_pointer_cast(x + n),
                                                 square<T>(),
                                                 init,
                                                 thrust::plus<T>()));
-  CHECK_CUDA(stream);
+  CHECK_CUDA(stream_view.value());
   return result;
 }
 template <typename T>
-T nrm1(size_t n, T *x)
+T nrm1(size_t n, T* x)
 {
-  cudaStream_t stream{nullptr};
-  T result = thrust::reduce(rmm::exec_policy(stream)->on(stream),
+  rmm::cuda_stream_view stream_view;
+  T result = thrust::reduce(rmm::exec_policy(stream_view),
                             thrust::device_pointer_cast(x),
                             thrust::device_pointer_cast(x + n));
-  CHECK_CUDA(stream);
+  CHECK_CUDA(stream_view.value());
   return result;
 }
 template <typename T>
-void scal(size_t n, T val, T *x)
+void scal(size_t n, T val, T* x)
 {
-  cudaStream_t stream{nullptr};
-  thrust::transform(rmm::exec_policy(stream)->on(stream),
+  rmm::cuda_stream_view stream_view;
+  thrust::transform(rmm::exec_policy(stream_view),
                     thrust::device_pointer_cast(x),
                     thrust::device_pointer_cast(x + n),
                     thrust::make_constant_iterator(val),
                     thrust::device_pointer_cast(x),
                     thrust::multiplies<T>());
-  CHECK_CUDA(stream);
+  CHECK_CUDA(stream_view.value());
 }
 template <typename T>
-void addv(size_t n, T val, T *x)
+void addv(size_t n, T val, T* x)
 {
-  cudaStream_t stream{nullptr};
-  thrust::transform(rmm::exec_policy(stream)->on(stream),
+  rmm::cuda_stream_view stream_view;
+  thrust::transform(rmm::exec_policy(stream_view),
                     thrust::device_pointer_cast(x),
                     thrust::device_pointer_cast(x + n),
                     thrust::make_constant_iterator(val),
                     thrust::device_pointer_cast(x),
                     thrust::plus<T>());
-  CHECK_CUDA(stream);
+  CHECK_CUDA(stream_view.value());
 }
 template <typename T>
-void fill(size_t n, T *x, T value)
+void fill(size_t n, T* x, T value)
 {
-  cudaStream_t stream{nullptr};
-  thrust::fill(rmm::exec_policy(stream)->on(stream),
+  rmm::cuda_stream_view stream_view;
+  thrust::fill(rmm::exec_policy(stream_view),
               thrust::device_pointer_cast(x),
               thrust::device_pointer_cast(x + n),
               value);
-  CHECK_CUDA(stream);
+  CHECK_CUDA(stream_view.value());
 }
 template <typename T, typename M>
-void scatter(size_t n, T *src, T *dst, M *map)
+void scatter(size_t n, T* src, T* dst, M* map)
 {
-  cudaStream_t stream{nullptr};
-  thrust::scatter(rmm::exec_policy(stream)->on(stream),
+  rmm::cuda_stream_view stream_view;
+  thrust::scatter(rmm::exec_policy(stream_view),
                   thrust::device_pointer_cast(src),
                   thrust::device_pointer_cast(src + n),
                   thrust::device_pointer_cast(map),
                   thrust::device_pointer_cast(dst));
-  CHECK_CUDA(stream);
+  CHECK_CUDA(stream_view.value());
 }
 template <typename T>
-void printv(size_t n, T *vec, int offset)
+void printv(size_t n, T* vec, int offset)
 {
   thrust::device_ptr<T> dev_ptr(vec);
   std::cout.precision(15);
@@ -212,13 +199,13 @@ void printv(size_t n, T *vec, int offset)
 }
 template <typename T>
-void copy(size_t n, T *x, T *res)
+void copy(size_t n, T* x, T* res)
 {
   thrust::device_ptr<T> dev_ptr(x);
   thrust::device_ptr<T> res_ptr(res);
-  cudaStream_t stream{nullptr};
-  thrust::copy_n(rmm::exec_policy(stream)->on(stream), dev_ptr, n, res_ptr);
-  CHECK_CUDA(stream);
+  rmm::cuda_stream_view stream_view;
+  thrust::copy_n(rmm::exec_policy(stream_view), dev_ptr, n, res_ptr);
+  CHECK_CUDA(stream_view.value());
 }
 template <typename T>
@@ -230,35 +217,35 @@ template <typename T>
 struct dangling_functor : public thrust::unary_function<T, T> {
   const T val;
   dangling_functor(T _val) : val(_val) {}
-  __host__ __device__ T operator()(const T &x) const { return val + x; }
+  __host__ __device__ T operator()(const T& x) const { return val + x; }
 };
 template <typename T>
-void update_dangling_nodes(size_t n, T *dangling_nodes, T damping_factor)
+void update_dangling_nodes(size_t n, T* dangling_nodes, T damping_factor)
 {
-  cudaStream_t stream{nullptr};
-  thrust::transform_if(rmm::exec_policy(stream)->on(stream),
+  rmm::cuda_stream_view stream_view;
+  thrust::transform_if(rmm::exec_policy(stream_view),
                        thrust::device_pointer_cast(dangling_nodes),
                        thrust::device_pointer_cast(dangling_nodes + n),
                        thrust::device_pointer_cast(dangling_nodes),
                        dangling_functor<T>(1.0 - damping_factor),
                        is_zero<T>());
-  CHECK_CUDA(stream);
+  CHECK_CUDA(stream_view.value());
 }
 // google matrix kernels
 template <typename IndexType, typename ValueType>
 __global__ void degree_coo(const IndexType n,
                            const IndexType e,
-                           const IndexType *ind,
-                           ValueType *degree)
+                           const IndexType* ind,
+                           ValueType* degree)
 {
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x)
     atomicAdd(&degree[ind[i]], (ValueType)1.0);
 }
 template <typename IndexType, typename ValueType>
-__global__ void flag_leafs_kernel(const size_t n, const IndexType *degree, ValueType *bookmark)
+__global__ void flag_leafs_kernel(const size_t n, const IndexType* degree, ValueType* bookmark)
 {
   for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
     if (degree[i] == 0) bookmark[i] = 1.0;
@@ -267,19 +254,19 @@ __global__ void flag_leafs_kernel(const
size_t n, const IndexType *degree, Value template __global__ void degree_offsets(const IndexType n, const IndexType e, - const IndexType *ind, - ValueType *degree) + const IndexType* ind, + ValueType* degree) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) degree[i] += ind[i + 1] - ind[i]; } template -__global__ void type_convert(FromType *array, int n) +__global__ void type_convert(FromType* array, int n) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { ToType val = array[i]; - ToType *vals = (ToType *)array; + ToType* vals = (ToType*)array; vals[i] = val; } } @@ -287,10 +274,10 @@ __global__ void type_convert(FromType *array, int n) template __global__ void equi_prob3(const IndexType n, const IndexType e, - const IndexType *csrPtr, - const IndexType *csrInd, - ValueType *val, - IndexType *degree) + const IndexType* csrPtr, + const IndexType* csrInd, + ValueType* val, + IndexType* degree) { int j, row, col; for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { @@ -306,10 +293,10 @@ __global__ void equi_prob3(const IndexType n, template __global__ void equi_prob2(const IndexType n, const IndexType e, - const IndexType *csrPtr, - const IndexType *csrInd, - ValueType *val, - IndexType *degree) + const IndexType* csrPtr, + const IndexType* csrInd, + ValueType* val, + IndexType* degree) { int row = blockIdx.x * blockDim.x + threadIdx.x; if (row < n) { @@ -327,13 +314,13 @@ __global__ void equi_prob2(const IndexType n, template void HT_matrix_csc_coo(const IndexType n, const IndexType e, - const IndexType *csrPtr, - const IndexType *csrInd, - ValueType *val, - ValueType *bookmark) + const IndexType* csrPtr, + const IndexType* csrInd, + ValueType* val, + ValueType* bookmark) { - cudaStream_t stream{nullptr}; - rmm::device_vector degree(n, 0); + rmm::cuda_stream_view stream_view; + rmm::device_uvector degree(n, stream_view); dim3 nthreads, nblocks; nthreads.x = min(e, CUDA_MAX_KERNEL_THREADS); @@ -343,8 +330,8 @@ void HT_matrix_csc_coo(const IndexType n, nblocks.y = 1; nblocks.z = 1; degree_coo - <<>>(n, e, csrInd, degree.data().get()); - CHECK_CUDA(stream); + <<>>(n, e, csrInd, degree.data()); + CHECK_CUDA(stream_view.value()); int y = 4; nthreads.x = 32 / y; @@ -354,12 +341,12 @@ void HT_matrix_csc_coo(const IndexType n, nblocks.y = 1; nblocks.z = min((n + nthreads.z - 1) / nthreads.z, CUDA_MAX_BLOCKS); // 1; equi_prob3 - <<>>(n, e, csrPtr, csrInd, val, degree.data().get()); - CHECK_CUDA(stream); + <<>>(n, e, csrPtr, csrInd, val, degree.data()); + CHECK_CUDA(stream_view.value()); ValueType a = 0.0; fill(n, bookmark, a); - CHECK_CUDA(stream); + CHECK_CUDA(stream_view.value()); nthreads.x = min(n, CUDA_MAX_KERNEL_THREADS); nthreads.y = 1; @@ -368,100 +355,12 @@ void HT_matrix_csc_coo(const IndexType n, nblocks.y = 1; nblocks.z = 1; flag_leafs_kernel - <<>>(n, degree.data().get(), bookmark); - CHECK_CUDA(stream); -} - -template -__global__ void permute_vals_kernel(const IndexType e, - IndexType *perm, - ValueType *in, - ValueType *out) -{ - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) - out[i] = in[perm[i]]; -} - -template -void permute_vals( - const IndexType e, IndexType *perm, ValueType *in, ValueType *out, cudaStream_t stream = nullptr) -{ - int nthreads = min(e, CUDA_MAX_KERNEL_THREADS); - int nblocks = min((e + nthreads - 1) / nthreads, CUDA_MAX_BLOCKS); - permute_vals_kernel<<>>(e, perm, in, out); -} - -// This will remove 
duplicate along with sorting -// This will sort the COO Matrix, row will be sorted and each column of same row will be sorted. -template -void remove_duplicate( - IndexType *src, IndexType *dest, ValueType *val, SizeT &nnz, cudaStream_t stream = nullptr) -{ - if (val != NULL) { - thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), - thrust::raw_pointer_cast(val), - thrust::raw_pointer_cast(val) + nnz, - thrust::make_zip_iterator(thrust::make_tuple( - thrust::raw_pointer_cast(src), thrust::raw_pointer_cast(dest)))); - thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), - thrust::raw_pointer_cast(dest), - thrust::raw_pointer_cast(dest + nnz), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::raw_pointer_cast(src), thrust::raw_pointer_cast(val)))); - thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), - thrust::raw_pointer_cast(src), - thrust::raw_pointer_cast(src + nnz), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::raw_pointer_cast(dest), thrust::raw_pointer_cast(val)))); - - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - typedef thrust::tuple ZipIteratorTuple; - typedef thrust::zip_iterator ZipZipIterator; - - ZipZipIterator newEnd = - thrust::unique(rmm::exec_policy(stream)->on(stream), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::raw_pointer_cast(src), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::raw_pointer_cast(dest), thrust::raw_pointer_cast(val))))), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::raw_pointer_cast(src + nnz), - thrust::make_zip_iterator(thrust::make_tuple(dest + nnz, val + nnz))))); - - ZipIteratorTuple endTuple = newEnd.get_iterator_tuple(); - IndexType *row_end = thrust::get<0>(endTuple); - - nnz = ((size_t)row_end - (size_t)src) / sizeof(IndexType); - } else { - thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), - thrust::raw_pointer_cast(dest), - thrust::raw_pointer_cast(dest + nnz), - thrust::raw_pointer_cast(src)); - thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), - thrust::raw_pointer_cast(src), - thrust::raw_pointer_cast(src + nnz), - thrust::raw_pointer_cast(dest)); - - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - ZipIterator newEnd = - thrust::unique(rmm::exec_policy(stream)->on(stream), - thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(src), - thrust::raw_pointer_cast(dest))), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::raw_pointer_cast(src + nnz), thrust::raw_pointer_cast(dest + nnz)))); - - IteratorTuple endTuple = newEnd.get_iterator_tuple(); - IndexType *row_end = thrust::get<0>(endTuple); - - nnz = ((size_t)row_end - (size_t)src) / sizeof(IndexType); - } + <<>>(n, degree.data(), bookmark); + CHECK_CUDA(stream_view.value()); } template -__global__ void offsets_to_indices_kernel(const offsets_t *offsets, index_t v, index_t *indices) +__global__ void offsets_to_indices_kernel(const offsets_t* offsets, index_t v, index_t* indices) { auto tid{threadIdx.x}; auto ctaStart{blockIdx.x}; @@ -478,7 +377,7 @@ __global__ void offsets_to_indices_kernel(const offsets_t *offsets, index_t v, i } template -void offsets_to_indices(const offsets_t *offsets, index_t v, index_t *indices) +void offsets_to_indices(const offsets_t* offsets, index_t v, index_t* indices) { cudaStream_t stream{nullptr}; index_t nthreads = min(v, (index_t)CUDA_MAX_KERNEL_THREADS); @@ -488,7 +387,7 @@ void offsets_to_indices(const offsets_t *offsets, index_t v, 
index_t *indices) } template -void sequence(IndexType n, IndexType *vec, IndexType init = 0) +void sequence(IndexType n, IndexType* vec, IndexType init = 0) { thrust::sequence( thrust::device, thrust::device_pointer_cast(vec), thrust::device_pointer_cast(vec + n), init); @@ -496,16 +395,16 @@ void sequence(IndexType n, IndexType *vec, IndexType init = 0) } template -bool has_negative_val(DistType *arr, size_t n) +bool has_negative_val(DistType* arr, size_t n) { // custom kernel with boolean bitwise reduce may be // faster. - cudaStream_t stream{nullptr}; - DistType result = *thrust::min_element(rmm::exec_policy(stream)->on(stream), + rmm::cuda_stream_view stream_view; + DistType result = *thrust::min_element(rmm::exec_policy(stream_view), thrust::device_pointer_cast(arr), thrust::device_pointer_cast(arr + n)); - CHECK_CUDA(stream); + CHECK_CUDA(stream_view.value()); return (result < 0); } diff --git a/cpp/src/utilities/heap.cuh b/cpp/src/utilities/heap.cuh deleted file mode 100644 index 0747a658324..00000000000 --- a/cpp/src/utilities/heap.cuh +++ /dev/null @@ -1,222 +0,0 @@ -// -*-c++-*- - -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Utilities to treat array as a heap -// Author: Chuck Hastings charlesh@nvidia.com - -#ifndef HEAP_H -#define HEAP_H - -namespace cugraph { -namespace detail { - -namespace heap { -/* - * Our goal here is to treat a C-style array indexed - * from 0 to n-1 as a heap. The heap is a binary tress - * structure where the root of each tree is the smallest - * (or largest) value in that subtree. - * - * This is a completely serial implementation. The intention - * from a parallelism perspective would be to use this on - * a block of data assigned to a particular GPU (or CPU) thread. - * - * These functions will allow you to use an existing - * c-style array (host or device side) and manipulate - * it as a heap. - * - * Note, the heap will be represented like this - the - * shape indicates the binary tree structure, the element - * indicates the index of the array that is associated - * with the element. This diagram will help understand - * the parent/child calculations defined below. - * - * 0 - * 1 2 - * 3 4 5 6 - * 7 8 9 10 11 12 13 14 - * - * So element 0 is the root of the tree, element 1 is the - * left child of 0, element 2 is the right child of 0, etc. - */ - -namespace detail { -/** - * @brief Identify the parent index of the specified index. - * NOTE: This function does no bounds checking, so - * the parent of 0 is 0. - * - * See the above documentation for a picture to describe - * the tree. 
- * - * IndexT is a templated integer type of the index - * - * @param[in] index - the current array index - * @return the index of the parent of the current index - */ -template -inline IndexT __host__ __device__ parent(IndexT index) -{ - static_assert(std::is_integral::value, "Index must be of an integral type"); - - return ((index + 1) / 2) - 1; -} - -/** - * @brief Identify the left child index of the specified index. - * NOTE: This function does no bounds checking, so - * the left child computed might be out of bounds. - * - * See the above documentation for a picture to describe - * the tree. - * - * IndexT is a templated integer type of the index - * - * @param[in] index - the current array index - * @return the index of the left child of the current index - */ -template -inline IndexT __host__ __device__ left_child(IndexT index) -{ - static_assert(std::is_integral::value, "Index must be of an integral type"); - - return ((index + 1) * 2 - 1); -} - -/** - * @brief Identify the right child index of the specified index. - * NOTE: This function does no bounds checking, so - * the right child computed might be out of bounds. - * - * See the above documentation for a picture to describe - * the tree. - * - * IndexT is a templated integer type of the index - * - * @param[in] index - the current array index - * @return the index of the right child of the current index - */ -template -inline IndexT __host__ __device__ right_child(IndexT index) -{ - static_assert(std::is_integral::value, "Index must be of an integral type"); - - return (index + 1) * 2; -} -} // namespace detail - -/** - * @brief Reorder an existing array of elements into a heap - * - * ArrayT is a templated type of the array elements - * IndexT is a templated integer type of the index - * CompareT is a templated compare function - * - * @param[in, out] array - the existing array - * @param[in] size - the number of elements in the existing array - * @param[in] compare - the comparison function to use - * - */ -template -inline void __host__ __device__ heapify(ArrayT *array, IndexT size, CompareT compare) -{ - static_assert(std::is_integral::value, "Index must be of an integral type"); - - // - // We want to order ourselves as a heap. This is accomplished by starting - // at the end and for each element, compare with its parent and - // swap if necessary. We repeat this until there are no more swaps - // (should take no more than log2(size) iterations). - // - IndexT count_swaps = 1; - while (count_swaps > 0) { - count_swaps = 0; - for (IndexT i = size - 1; i > 0; --i) { - IndexT p = detail::parent(i); - - if (compare(array[i], array[p])) { - thrust::swap(array[i], array[p]); - ++count_swaps; - } - } - } -} - -/** - * @brief Pop the top element off of the heap. Note that the caller - * should decrement the size - the last element in the - * array is no longer used. - * - * ArrayT is a templated type of the array elements - * IndexT is a templated integer type of the index - * CompareT is a templated compare function - * - * @return - the top of the heap. - */ -template -inline ArrayT __host__ __device__ heap_pop(ArrayT *array, IndexT size, CompareT compare) -{ - static_assert(std::is_integral::value, "Index must be of an integral type"); - - // - // Swap the top of the array with the last element - // - --size; - thrust::swap(array[0], array[size]); - - // - // Now top element is no longer the smallest (largest), so we need - // to sift it down to the proper location. 
-  //
-  for (IndexT i = 0; i < size;) {
-    IndexT lc      = detail::left_child(i);
-    IndexT rc      = detail::right_child(i);
-    IndexT smaller = i;
-
-    //
-    //  We can go out of bounds, let's check the simple cases
-    //
-    if (rc < size) {
-      //
-      //  Both children exist in tree, pick the smaller (lerger)
-      //  one.
-      //
-      smaller = (compare(array[lc], array[rc])) ? lc : rc;
-    } else if (lc < size) {
-      smaller = lc;
-    }
-
-    if ((smaller != i) && (compare(array[smaller], array[i]))) {
-      thrust::swap(array[i], array[smaller]);
-      i = smaller;
-    } else {
-      //
-      //  If we don't swap then we can stop checking, break out of the loop
-      //
-      i = size;
-    }
-  }
-
-  return array[size];
-}
-}  // namespace heap
-
-}  // namespace detail
-}  // namespace cugraph
-
-#endif
diff --git a/cpp/src/utilities/high_res_timer.hpp b/cpp/src/utilities/high_res_timer.hpp
index f2d6bc6e13f..cf265991f21 100644
--- a/cpp/src/utilities/high_res_timer.hpp
+++ b/cpp/src/utilities/high_res_timer.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,8 +18,12 @@
 #include
 #include
 #include
+#include
+#include
 #include
+//#define TIMING
+
 class HighResTimer {
  public:
   HighResTimer() : timers() {}
@@ -50,10 +54,23 @@
     it->second.second += stop_time.tv_sec * 1000000000 + stop_time.tv_nsec;
   }
+  double get_average_runtime(std::string const& label)
+  {
+    auto it = timers.find(label);
+    if (it != timers.end()) {
+      return (static_cast<double>(it->second.second) / (1000000.0 * it->second.first));
+    } else {
+      std::stringstream ss;
+      ss << "ERROR: timing label: " << label << " not found.";
+
+      throw std::runtime_error(ss.str());
+    }
+  }
+
   //
   // Add display functions... specific label or entire structure
   //
-  void display(std::ostream &os)
+  void display(std::ostream& os)
   {
     os << "Timer Results (in ms):" << std::endl;
     for (auto it = timers.begin(); it != timers.end(); ++it) {
@@ -63,7 +80,7 @@
     }
   }
-  void display(std::ostream &os, std::string label)
+  void display(std::ostream& os, std::string label)
   {
     auto it = timers.find(label);
     os << it->first << " called " << it->second.first
@@ -71,7 +88,7 @@
        << std::endl;
   }
-  void display_and_clear(std::ostream &os)
+  void display_and_clear(std::ostream& os)
   {
     os << "Timer Results (in ms):" << std::endl;
     for (auto it = timers.begin(); it != timers.end(); ++it) {
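The new get_average_runtime() accessor returns the mean per-call time of a label in milliseconds (total nanoseconds divided by 1e6 times the call count) instead of printing it. A usage sketch, with a hypothetical label and timed region (start()/stop() are the existing HighResTimer methods):

HighResTimer hr_timer;
hr_timer.start("bfs");  // illustrative label
run_bfs_once();         // placeholder for the timed region
hr_timer.stop();
double avg_ms = hr_timer.get_average_runtime("bfs");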
diff --git a/cpp/src/utilities/host_barrier.cpp b/cpp/src/utilities/host_barrier.cpp
new file mode 100644
index 00000000000..659e4038c67
--- /dev/null
+++ b/cpp/src/utilities/host_barrier.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include
+
+#include
+
+namespace cugraph {
+namespace experimental {
+
+// FIXME: a temporary hack till UCC is integrated into RAFT (so we can use UCC barrier for DASK and
+// MPI barrier for MPI)
+void host_barrier(raft::comms::comms_t const& comm, rmm::cuda_stream_view stream_view)
+{
+  stream_view.synchronize();
+
+  auto const comm_size = comm.get_size();
+  auto const comm_rank = comm.get_rank();
+
+  // k-tree barrier
+
+  int constexpr k = 2;
+  static_assert(k >= 2);
+  std::vector<raft::comms::request_t> requests(k - 1);
+  std::vector<std::byte> dummies(k - 1);
+
+  // up
+
+  int mod = 1;
+  while (mod < comm_size) {
+    if (comm_rank % mod == 0) {
+      auto level_rank = comm_rank / mod;
+      if (level_rank % k == 0) {
+        auto num_irecvs = 0;
+        for (int i = 1; i < k; ++i) {
+          auto src_rank = (level_rank + i) * mod;
+          if (src_rank < comm_size) {
+            comm.irecv(dummies.data() + (i - 1),
+                       sizeof(std::byte),
+                       src_rank,
+                       int{0} /* tag */,
+                       requests.data() + (i - 1));
+            ++num_irecvs;
+          }
+        }
+        comm.waitall(num_irecvs, requests.data());
+      } else {
+        comm.isend(dummies.data(),
+                   sizeof(std::byte),
+                   (level_rank - (level_rank % k)) * mod,
+                   int{0} /* tag */,
+                   requests.data());
+        comm.waitall(1, requests.data());
+      }
+    }
+    mod *= k;
+  }
+
+  // down
+
+  mod /= k;
+  while (mod >= 1) {
+    if (comm_rank % mod == 0) {
+      auto level_rank = comm_rank / mod;
+      if (level_rank % k == 0) {
+        auto num_isends = 0;
+        for (int i = 1; i < k; ++i) {
+          auto dst_rank = (level_rank + i) * mod;
+          if (dst_rank < comm_size) {
+            comm.isend(dummies.data() + (i - 1),
+                       sizeof(std::byte),
+                       dst_rank,
+                       int{0} /* tag */,
+                       requests.data() + (i - 1));
+            ++num_isends;
+          }
+        }
+        comm.waitall(num_isends, requests.data());
+      } else {
+        comm.irecv(dummies.data(),
+                   sizeof(std::byte),
+                   (level_rank - (level_rank % k)) * mod,
+                   int{0} /* tag */,
+                   requests.data());
+        comm.waitall(1, requests.data());
+      }
+    }
+    mod /= k;
+  }
+}
+
+} // namespace experimental
+} // namespace cugraph
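For intuition: the up phase funnels a token up a k-ary tree of ranks and the down phase replays the same pairings in reverse. A standalone, host-only trace of the up-phase pairings (illustrative only; assumes comm_size = 4 and the same k = 2 as above):

#include <cstdio>

int main()
{
  int constexpr k         = 2;
  int constexpr comm_size = 4;  // assumed value, for illustration

  for (int mod = 1; mod < comm_size; mod *= k) {
    std::printf("up, mod=%d:", mod);
    for (int rank = 0; rank < comm_size; ++rank) {
      if (rank % mod == 0) {
        int level_rank = rank / mod;
        if (level_rank % k != 0)  // non-root of its k-group signals the group root
          std::printf(" %d->%d", rank, (level_rank - (level_rank % k)) * mod);
      }
    }
    std::printf("\n");
  }
  // prints "up, mod=1: 1->0 3->2" then "up, mod=2: 2->0";
  // the down phase sends along the same edges, root-to-leaf.
  return 0;
}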
diff --git a/cpp/src/utilities/path_retrieval.cu b/cpp/src/utilities/path_retrieval.cu
new file mode 100644
index 00000000000..765cccc1916
--- /dev/null
+++ b/cpp/src/utilities/path_retrieval.cu
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+#include
+
+#include
+#include
+
+namespace cugraph {
+namespace detail {
+
+template <typename vertex_t, typename weight_t>
+__global__ void get_traversed_cost_kernel(vertex_t const* vertices,
+                                          vertex_t const* preds,
+                                          vertex_t const* vtx_map,
+                                          weight_t const* info_weights,
+                                          weight_t* out,
+                                          vertex_t stop_vertex,
+                                          vertex_t num_vertices)
+{
+  for (vertex_t i = threadIdx.x + blockIdx.x * blockDim.x; i < num_vertices;
+       i += gridDim.x * blockDim.x) {
+    weight_t sum  = info_weights[i];
+    vertex_t pred = preds[i];
+    while (pred != stop_vertex) {
+      vertex_t pos = vtx_map[pred];
+      sum += info_weights[pos];
+      pred = preds[pos];
+    }
+    out[i] = sum;
+  }
+}
+
+template <typename vertex_t, typename weight_t>
+void get_traversed_cost_impl(raft::handle_t const& handle,
+                             vertex_t const* vertices,
+                             vertex_t const* preds,
+                             weight_t const* info_weights,
+                             weight_t* out,
+                             vertex_t stop_vertex,
+                             vertex_t num_vertices)
+{
+  auto stream          = handle.get_stream();
+  vertex_t max_blocks  = handle.get_device_properties().maxGridSize[0];
+  vertex_t max_threads = handle.get_device_properties().maxThreadsPerBlock;
+
+  dim3 nthreads, nblocks;
+  nthreads.x = std::min(num_vertices, max_threads);
+  nthreads.y = 1;
+  nthreads.z = 1;
+  nblocks.x  = std::min((num_vertices + nthreads.x - 1) / nthreads.x, max_blocks);
+  nblocks.y  = 1;
+  nblocks.z  = 1;
+
+  rmm::device_uvector<vertex_t> vtx_map_v(num_vertices, stream);
+  rmm::device_uvector<vertex_t> vtx_keys_v(num_vertices, stream);
+  vertex_t* vtx_map  = vtx_map_v.data();
+  vertex_t* vtx_keys = vtx_keys_v.data();
+  raft::copy(vtx_keys, vertices, num_vertices, stream);
+
+  thrust::sequence(rmm::exec_policy(stream)->on(stream), vtx_map, vtx_map + num_vertices);
+
+  thrust::stable_sort_by_key(
+    rmm::exec_policy(stream)->on(stream), vtx_keys, vtx_keys + num_vertices, vtx_map);
+
+  get_traversed_cost_kernel<<<nblocks, nthreads>>>(
+    vertices, preds, vtx_map, info_weights, out, stop_vertex, num_vertices);
+}
+}  // namespace detail
+
+template <typename vertex_t, typename weight_t>
+void get_traversed_cost(raft::handle_t const& handle,
+                        vertex_t const* vertices,
+                        vertex_t const* preds,
+                        weight_t const* info_weights,
+                        weight_t* out,
+                        vertex_t stop_vertex,
+                        vertex_t num_vertices)
+{
+  CUGRAPH_EXPECTS(num_vertices > 0, "num_vertices should be strictly positive");
+  CUGRAPH_EXPECTS(out != nullptr, "out should be of size num_vertices");
+  cugraph::detail::get_traversed_cost_impl(
+    handle, vertices, preds, info_weights, out, stop_vertex, num_vertices);
+}
+
+template void get_traversed_cost(raft::handle_t const& handle,
+                                 int32_t const* vertices,
+                                 int32_t const* preds,
+                                 float const* info_weights,
+                                 float* out,
+                                 int32_t stop_vertex,
+                                 int32_t num_vertices);
+
+template void get_traversed_cost(raft::handle_t const& handle,
+                                 int32_t const* vertices,
+                                 int32_t const* preds,
+                                 double const* info_weights,
+                                 double* out,
+                                 int32_t stop_vertex,
+                                 int32_t num_vertices);
+
+template void get_traversed_cost(raft::handle_t const& handle,
+                                 int64_t const* vertices,
+                                 int64_t const* preds,
+                                 float const* info_weights,
+                                 float* out,
+                                 int64_t stop_vertex,
+                                 int64_t num_vertices);
+
+template void get_traversed_cost(raft::handle_t const& handle,
+                                 int64_t const* vertices,
+                                 int64_t const* preds,
+                                 double const* info_weights,
+                                 double* out,
+                                 int64_t stop_vertex,
+                                 int64_t num_vertices);
+}  // namespace cugraph
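get_traversed_cost() gives, for each vertex, its own weight plus the sum of weights along its predecessor chain back to stop_vertex. A calling sketch (names, sizes, and the -1 sentinel are illustrative, not from this PR; the arrays would come from an SSSP-style traversal):

rmm::device_uvector<int32_t> vertices(num_vertices, handle.get_stream());
rmm::device_uvector<int32_t> preds(num_vertices, handle.get_stream());
rmm::device_uvector<float> info_weights(num_vertices, handle.get_stream());
rmm::device_uvector<float> out(num_vertices, handle.get_stream());
// ... fill vertices / preds / info_weights on device ...
cugraph::get_traversed_cost(handle,
                            vertices.data(),
                            preds.data(),
                            info_weights.data(),
                            out.data(),
                            int32_t{-1} /* assumed stop_vertex sentinel */,
                            num_vertices);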
diff --git a/cpp/src/utilities/spmv_1D.cu b/cpp/src/utilities/spmv_1D.cu
index 8a7378e69d3..b4db219fb89 100644
--- a/cpp/src/utilities/spmv_1D.cu
+++ b/cpp/src/utilities/spmv_1D.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,13 +19,13 @@
 namespace cugraph {
 namespace mg {
 template <typename vertex_t, typename edge_t, typename weight_t>
-MGcsrmv<vertex_t, edge_t, weight_t>::MGcsrmv(raft::handle_t const &handle,
-                                             vertex_t *local_vertices,
-                                             vertex_t *part_off,
-                                             edge_t *off,
-                                             vertex_t *ind,
-                                             weight_t *val,
-                                             weight_t *x)
+MGcsrmv<vertex_t, edge_t, weight_t>::MGcsrmv(raft::handle_t const& handle,
+                                             vertex_t* local_vertices,
+                                             vertex_t* part_off,
+                                             edge_t* off,
+                                             vertex_t* ind,
+                                             weight_t* val,
+                                             weight_t* x)
   : handle_(handle),
     local_vertices_(local_vertices),
     part_off_(part_off),
@@ -49,7 +49,7 @@ MGcsrmv<vertex_t, edge_t, weight_t>::~MGcsrmv()
 }
 template <typename vertex_t, typename edge_t, typename weight_t>
-void MGcsrmv<vertex_t, edge_t, weight_t>::run(weight_t *x)
+void MGcsrmv<vertex_t, edge_t, weight_t>::run(weight_t* x)
 {
   using namespace raft::matrix;
@@ -72,7 +72,7 @@
   auto stream = handle_.get_stream();
-  auto const &comm{handle_.get_comms()};  // local
+  auto const& comm{handle_.get_comms()};  // local
   std::vector recvbuf(comm.get_size());
   std::vector displs(comm.get_size());
diff --git a/cpp/src/utilities/spmv_1D.cuh b/cpp/src/utilities/spmv_1D.cuh
index 81466595c19..31af0c75585 100644
--- a/cpp/src/utilities/spmv_1D.cuh
+++ b/cpp/src/utilities/spmv_1D.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@
 #pragma once
 #include
+#include <utilities/error.hpp>
 #include
-#include "utilities/error.hpp"
 namespace cugraph {
 namespace mg {
diff --git a/cpp/src/visitors/bfs_visitor.cpp b/cpp/src/visitors/bfs_visitor.cpp
new file mode 100644
index 00000000000..672cc35f00b
--- /dev/null
+++ b/cpp/src/visitors/bfs_visitor.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Andrei Schaffer, aschaffer@nvidia.com
+//
+
+#include
+#include
+
+namespace cugraph {
+namespace visitors {
+
+//
+// wrapper code:
+//
+template
+void bfs_visitor::value>>::
+  visit_graph(graph_envelope_t::base_graph_t const& graph)
+{
+  // Note: this must be called only on:
+  // graph_view_t
+  //
+  if constexpr (st == false) {
+    // unless algorithms only call virtual graph methods
+    // under the hood, the algos require this conversion:
+    //
+    graph_t const* p_g =
+      static_cast const*>(&graph);
+
+    auto gview = p_g->view();
+
+    auto const& v_args = ep_.get_args();
+
+    // unpack bfs() args:
+    //
+    assert(v_args.size() == 7);
+
+    // cnstr.
args unpacking: + // + raft::handle_t const& handle = *static_cast(v_args[0]); + + vertex_t* p_d_dist = static_cast(v_args[1]); + + vertex_t* p_d_predec = static_cast(v_args[2]); + + vertex_t src_v = *static_cast(v_args[3]); + + bool dir_opt = *static_cast(v_args[4]); + + auto depth_l = *static_cast(v_args[5]); + + bool check = *static_cast(v_args[6]); + + // call algorithm + // (no result; void) + // + bfs(handle, gview, p_d_dist, p_d_predec, src_v, dir_opt, depth_l, check); + } else { + CUGRAPH_FAIL("Unsupported BFS algorithm (store_transposed == true)."); + } +} + +// EIDir's: +// +template class bfs_visitor; +template class bfs_visitor; + +template class bfs_visitor; +template class bfs_visitor; + +template class bfs_visitor; +template class bfs_visitor; + +template class bfs_visitor; +template class bfs_visitor; + +//------ + +template class bfs_visitor; +template class bfs_visitor; + +template class bfs_visitor; +template class bfs_visitor; + +template class bfs_visitor; +template class bfs_visitor; + +template class bfs_visitor; +template class bfs_visitor; + +//------ + +template class bfs_visitor; +template class bfs_visitor; + +template class bfs_visitor; +template class bfs_visitor; + +template class bfs_visitor; +template class bfs_visitor; + +template class bfs_visitor; +template class bfs_visitor; + +} // namespace visitors + +namespace api { +using namespace cugraph::visitors; +// wrapper: +// macro option: MAKE_WRAPPER(bfs) +// +return_t bfs(graph_envelope_t const& g, erased_pack_t& ep) +{ + auto p_visitor = g.factory()->make_bfs_visitor(ep); + + g.apply(*p_visitor); + + return_t ret{p_visitor->get_result()}; + + return ret; // RVO-ed; +} + +} // namespace api +} // namespace cugraph diff --git a/cpp/src/visitors/graph_envelope.cpp b/cpp/src/visitors/graph_envelope.cpp new file mode 100755 index 00000000000..927c5060b1e --- /dev/null +++ b/cpp/src/visitors/graph_envelope.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Andrei Schaffer, aschaffer@nvidia.com +// + +#include +#include + +namespace cugraph { +namespace visitors { + +// call cascaded dispatcher with factory and erased_pack_t +// +graph_envelope_t::graph_envelope_t(DTypes vertex_tid, + DTypes edge_tid, + DTypes weight_tid, + bool st, + bool mg, + GTypes graph_tid, + erased_pack_t& ep) + : p_impl_fact_(vertex_dispatcher(vertex_tid, edge_tid, weight_tid, st, mg, graph_tid, ep)) +{ +} + +template class graph_factory_t>; +template class graph_factory_t>; + +template class graph_factory_t>; +template class graph_factory_t>; + +template class graph_factory_t>; +template class graph_factory_t>; + +template class graph_factory_t>; +template class graph_factory_t>; + +} // namespace visitors +} // namespace cugraph diff --git a/cpp/src/visitors/visitors_factory.cpp b/cpp/src/visitors/visitors_factory.cpp new file mode 100644 index 00000000000..c4238166c6a --- /dev/null +++ b/cpp/src/visitors/visitors_factory.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Andrei Schaffer, aschaffer@nvidia.com +// + +#include +#include + +namespace cugraph { +namespace visitors { + +template +std::unique_ptr +dependent_factory_t::value>>:: + make_louvain_visitor(erased_pack_t& ep) const +{ + /// return std::unique_ptr( + /// static_cast(new louvain_visitor(ep))); + + return nullptr; // for now... +} + +template +std::unique_ptr +dependent_factory_t::value>>:: + make_bfs_visitor(erased_pack_t& ep) const +{ + // return nullptr; // for now... 
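+  // (unlike the louvain stub above, bfs has a concrete visitor; `ep` carries the
+  //  type-erased argument pack that bfs_visitor::visit_graph() later unpacks)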
+ return std::make_unique>(ep); +} + +// EIDir's: +// +template class dependent_factory_t; +template class dependent_factory_t; + +template class dependent_factory_t; +template class dependent_factory_t; + +template class dependent_factory_t; +template class dependent_factory_t; + +template class dependent_factory_t; +template class dependent_factory_t; + +//------ + +template class dependent_factory_t; +template class dependent_factory_t; + +template class dependent_factory_t; +template class dependent_factory_t; + +template class dependent_factory_t; +template class dependent_factory_t; + +template class dependent_factory_t; +template class dependent_factory_t; + +//------ + +template class dependent_factory_t; +template class dependent_factory_t; + +template class dependent_factory_t; +template class dependent_factory_t; + +template class dependent_factory_t; +template class dependent_factory_t; + +template class dependent_factory_t; +template class dependent_factory_t; + +// Either use EIDir or specialization, can't have both; +// Prefer specialization when EIdir's are not enough +// because of cascaded-dispatcher exhaustive instantiations +// In this case EIDir above are enough; +} // namespace visitors +} // namespace cugraph diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 593c36359e2..2d13b46ac61 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,6 +1,6 @@ ο»Ώ#============================================================================= # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,35 +16,155 @@ # #============================================================================= +################################################################################################### +# - common test utils ----------------------------------------------------------------------------- + +add_library(cugraphtestutil STATIC + utilities/matrix_market_file_utilities.cu + utilities/rmat_utilities.cpp + utilities/thrust_wrapper.cu + utilities/misc_utilities.cpp + components/wcc_graphs.cu + ../../thirdparty/mmio/mmio.c) + +target_compile_options(cugraphtestutil + PUBLIC "$<$:${CUGRAPH_CXX_FLAGS}>" + "$:${CUGRAPH_CUDA_FLAGS}>>" +) + +set_property(TARGET cugraphtestutil PROPERTY POSITION_INDEPENDENT_CODE ON) + +target_include_directories(cugraphtestutil + PUBLIC + "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio" + "${CMAKE_CURRENT_SOURCE_DIR}" + "${CUGRAPH_SOURCE_DIR}/src" +) + +target_link_libraries(cugraphtestutil + PUBLIC + cugraph + cuco::cuco + NCCL::NCCL +) + + +add_library(cugraphmgtestutil STATIC + "${CMAKE_CURRENT_SOURCE_DIR}/utilities/device_comm_wrapper.cu") + +set_property(TARGET cugraphmgtestutil PROPERTY POSITION_INDEPENDENT_CODE ON) + +target_include_directories(cugraphmgtestutil + PRIVATE + "${CUB_INCLUDE_DIR}" + "${THRUST_INCLUDE_DIR}" + "${CUCO_INCLUDE_DIR}" + "${LIBCUDACXX_INCLUDE_DIR}" + "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" + "${RMM_INCLUDE}" + "${NCCL_INCLUDE_DIRS}" + "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio" + "${CMAKE_CURRENT_SOURCE_DIR}/../include" + "${CMAKE_CURRENT_SOURCE_DIR}" + "${RAFT_DIR}/cpp/include" +) + +target_link_libraries(cugraphmgtestutil cugraph) + ################################################################################################### # - compiler function 
-----------------------------------------------------------------------------

-function(ConfigureTest CMAKE_TEST_NAME CMAKE_TEST_SRC CMAKE_EXTRA_LIBS)
-    add_executable(${CMAKE_TEST_NAME}
-        ${CMAKE_TEST_SRC})
+function(ConfigureTest CMAKE_TEST_NAME)
+    add_executable(${CMAKE_TEST_NAME} ${ARGN})

-    target_include_directories(${CMAKE_TEST_NAME}
+    target_link_libraries(${CMAKE_TEST_NAME}
         PRIVATE
-        "${CUB_INCLUDE_DIR}"
-        "${THRUST_INCLUDE_DIR}"
-        "${CUCO_INCLUDE_DIR}"
-        "${LIBCUDACXX_INCLUDE_DIR}"
-        "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}"
-        "${GTEST_INCLUDE_DIR}"
-        "${RMM_INCLUDE}"
-        "${CUDF_INCLUDE}"
-        "${CUDF_INCLUDE}/libcudf/libcudacxx"
-        "${NCCL_INCLUDE_DIRS}"
-        "${CMAKE_SOURCE_DIR}/../thirdparty/mmio"
-        "${CMAKE_SOURCE_DIR}/include"
-        "${CMAKE_SOURCE_DIR}/src"
-        "${CMAKE_CURRENT_SOURCE_DIR}"
-        "${RAFT_DIR}/cpp/include"
+        cugraphtestutil
+        cugraph
+        GTest::gmock
+        GTest::gmock_main
+        GTest::gtest
+        GTest::gtest_main
+        NCCL::NCCL
+        CUDA::cublas
+        CUDA::cusparse
+        CUDA::cusolver
+        CUDA::curand
     )

+    if(OpenMP_CXX_FOUND)
+        target_link_libraries(${CMAKE_TEST_NAME} PRIVATE
+###################################################################################################
+### Use ${OpenMP_CXX_LIB_NAMES} instead of OpenMP::OpenMP_CXX to avoid the following warnings.
+###
+### Cannot generate a safe runtime search path for target TARGET_NAME
+### because files in some directories may conflict with libraries in implicit
+### directories:
+### ...
+###
+### libgomp.so is included in the conda base environment and copied to every new conda
+### environment. If a full file path is provided (e.g. ${CUDF_LIBRARY}), cmake
+### extracts the directory path and adds the directory path to BUILD_RPATH (if BUILD_RPATH is not
+### disabled).
+###
+### cmake maintains a system-specific list of implicit directories (e.g. /lib, /lib/x86_64-linux-gnu,
+### /lib32, /lib32/x86_64-linux-gnu, /lib64, /lib64/x86_64-linux-gnu, /usr/lib,
+### /usr/lib/gcc/x86_64-linux-gnu/7, /usr/lib/x86_64-linux-gnu, /usr/lib32,
+### /usr/lib32/x86_64-linux-gnu, /usr/lib64, /usr/lib64/x86_64-linux-gnu,
+### /usr/local/cuda-10.0/lib64", /usr/local/cuda-10.0/lib64/stubs).
+###
+### If a full path to libgomp.so is provided (which is the case with OpenMP::OpenMP_CXX), cmake
+### checks whether there is any other libgomp.so with a different full path (after resolving
+### soft links) in the search paths (implicit directories + BUILD_RPATH). There is one in the
+### path included in BUILD_RPATH when ${CUDF_LIBRARY} is added; this one can
+### potentially hide the one in the provided full path and cmake generates a warning (and RPATH
+### is searched before the directories in /etc/ld.so.conf; ld.so.conf does not coincide but
+### overlaps with implicit directories).
+###
+### If we provide just the library names (gomp;pthread), cmake does not generate warnings (we
+### did not specify which libgomp.so should be loaded at runtime), and the one first found in
+### the search order is loaded (we can change the loaded library by setting LD_LIBRARY_PATH or
+### manually editing BUILD_RPATH).
+###
+### Manually editing BUILD_RPATH:
+###   set(TARGET_BUILD_RPATH "")
+###   foreach(TMP_VAR_FULLPATH IN LISTS OpenMP_CXX_LIBRARIES)
+###       get_filename_component(TMP_VAR_DIR ${TMP_VAR_FULLPATH} DIRECTORY)
+###       string(APPEND TARGET_BUILD_RPATH "${TMP_VAR_DIR};")
+###       get_filename_component(TMP_VAR_REALPATH ${TMP_VAR_FULLPATH} REALPATH)
+###       get_filename_component(TMP_VAR_DIR ${TMP_VAR_REALPATH} DIRECTORY)
+###       # cmake automatically removes duplicates, so skip checking.
+### string(APPEND TARGET_BUILD_RPATH "${TMP_VAR_DIR};") +### endforeach() +### string(APPEND TARGET_BUILD_RPATH "${CONDA_PREFIX}/lib") +### message(STATUS "TARGET_BUILD_RPATH=${TARGET_BUILD_RPATH}") +### set_target_properties(target PROPERTIES +### BUILD_RPATH "${TARGET_BUILD_RPATH}") + ${OpenMP_CXX_LIB_NAMES}) + endif(OpenMP_CXX_FOUND) + + add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) +endfunction() + +function(ConfigureTestMG CMAKE_TEST_NAME) + add_executable(${CMAKE_TEST_NAME} ${ARGN}) + target_link_libraries(${CMAKE_TEST_NAME} PRIVATE - gtest gmock_main gmock cugraph ${CUDF_LIBRARY} ${CMAKE_EXTRA_LIBS} ${NCCL_LIBRARIES} cudart cuda cublas cusparse cusolver curand) + cugraphmgtestutil + cugraphtestutil + cugraph + GTest::gmock + GTest::gmock_main + GTest::gtest + GTest::gtest_main + NCCL::NCCL + CUDA::cublas + CUDA::cusparse + CUDA::cusolver + CUDA::curand + MPI::MPI_CXX + ) if(OpenMP_CXX_FOUND) target_link_libraries(${CMAKE_TEST_NAME} PRIVATE @@ -96,10 +216,14 @@ function(ConfigureTest CMAKE_TEST_NAME CMAKE_TEST_SRC CMAKE_EXTRA_LIBS) ### BUILD_RPATH "${TARGET_BUILD_RPATH}") ${OpenMP_CXX_LIB_NAMES}) endif(OpenMP_CXX_FOUND) - set_target_properties(${CMAKE_TEST_NAME} PROPERTIES - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gtests/") - add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) + add_test(NAME ${CMAKE_TEST_NAME} + COMMAND ${MPIEXEC_EXECUTABLE} + ${MPIEXEC_NUMPROC_FLAG} + ${GPU_COUNT} + ${MPIEXEC_PREFLAGS} + ${CMAKE_TEST_NAME} + ${MPIEXEC_POSTFLAGS}) endfunction() ################################################################################################### @@ -117,206 +241,236 @@ endif(RAPIDS_DATASET_ROOT_DIR) ################################################################################################### ################################################################################################### -# - katz centrality tests ------------------------------------------------------------------------- +# - graph generator tests ------------------------------------------------------------------------- -set(KATZ_TEST_SRC - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/centrality/katz_centrality_test.cu") +set(GRAPH_GENERATORS_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/generators/generators_test.cpp") - ConfigureTest(KATZ_TEST "${KATZ_TEST_SRC}" "") + ConfigureTest(GRAPH_GENERATORS_TEST "${GRAPH_GENERATORS_TEST_SRC}") ################################################################################################### -# - betweenness centrality tests ------------------------------------------------------------------ +# - erdos renyi graph generator tests ------------------------------------------------------------- -set(BETWEENNESS_TEST_SRC - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/centrality/betweenness_centrality_test.cu") +set(ERDOS_RENYI_GENERATOR_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/generators/erdos_renyi_test.cpp") - ConfigureTest(BETWEENNESS_TEST "${BETWEENNESS_TEST_SRC}" "") - -set(EDGE_BETWEENNESS_TEST_SRC - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/centrality/edge_betweenness_centrality_test.cu") - - ConfigureTest(EDGE_BETWEENNESS_TEST "${EDGE_BETWEENNESS_TEST_SRC}" "") + ConfigureTest(ERDOS_RENYI_GENERATOR_TEST "${ERDOS_RENYI_GENERATOR_TEST_SRC}") ################################################################################################### -# - pagerank tests 
-------------------------------------------------------------------------------- - -set(PAGERANK_TEST_SRC - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/pagerank/pagerank_test.cpp") +# - katz centrality tests ------------------------------------------------------------------------- +ConfigureTest(KATZ_TEST centrality/katz_centrality_test.cu) -ConfigureTest(PAGERANK_TEST "${PAGERANK_TEST_SRC}" "") +################################################################################################### +# - betweenness centrality tests ------------------------------------------------------------------ +ConfigureTest(BETWEENNESS_TEST centrality/betweenness_centrality_test.cu) +ConfigureTest(EDGE_BETWEENNESS_TEST centrality/edge_betweenness_centrality_test.cu) ################################################################################################### # - SSSP tests ------------------------------------------------------------------------------------ - -set(SSSP_TEST_SRCS - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/traversal/sssp_test.cu") - -ConfigureTest(SSSP_TEST "${SSSP_TEST_SRCS}" "") +ConfigureTest(SSSP_TEST traversal/sssp_test.cu) ################################################################################################### # - BFS tests ------------------------------------------------------------------------------------- - -set(BFS_TEST_SRCS - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/traversal/bfs_test.cu") - -ConfigureTest(BFS_TEST "${BFS_TEST_SRCS}" "") +ConfigureTest(BFS_TEST traversal/bfs_test.cu) ################################################################################################### # - LOUVAIN tests --------------------------------------------------------------------------------- - -set(LOUVAIN_TEST_SRC - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/community/louvain_test.cu") - -ConfigureTest(LOUVAIN_TEST "${LOUVAIN_TEST_SRC}" "") +ConfigureTest(LOUVAIN_TEST community/louvain_test.cpp) ################################################################################################### # - LEIDEN tests --------------------------------------------------------------------------------- - -set(LEIDEN_TEST_SRC - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/community/leiden_test.cpp") - -ConfigureTest(LEIDEN_TEST "${LEIDEN_TEST_SRC}" "") +ConfigureTest(LEIDEN_TEST community/leiden_test.cpp) ################################################################################################### # - ECG tests --------------------------------------------------------------------------------- - -set(ECG_TEST_SRC - "${CMAKE_CURRENT_SOURCE_DIR}/community/ecg_test.cu") - -ConfigureTest(ECG_TEST "${ECG_TEST_SRC}" "") +ConfigureTest(ECG_TEST community/ecg_test.cpp) ################################################################################################### # - Balanced cut clustering tests ----------------------------------------------------------------- - -set(BALANCED_TEST_SRC - "${CMAKE_CURRENT_SOURCE_DIR}/community/balanced_edge_test.cpp") - -ConfigureTest(BALANCED_TEST "${BALANCED_TEST_SRC}" "") +ConfigureTest(BALANCED_TEST community/balanced_edge_test.cpp) ################################################################################################### # - TRIANGLE tests -------------------------------------------------------------------------------- - -set(TRIANGLE_TEST_SRC - 
"${CMAKE_CURRENT_SOURCE_DIR}/community/triangle_test.cu") - -ConfigureTest(TRIANGLE_TEST "${TRIANGLE_TEST_SRC}" "") +ConfigureTest(TRIANGLE_TEST community/triangle_test.cu) ################################################################################################### -# - RENUMBERING tests ----------------------------------------------------------------------------- - -set(RENUMBERING_TEST_SRC - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/renumber/renumber_test.cu") - -ConfigureTest(RENUMBERING_TEST "${RENUMBERING_TEST_SRC}" "") +# - EGO tests -------------------------------------------------------------------------------- +ConfigureTest(EGO_TEST community/egonet_test.cu) ################################################################################################### # - FORCE ATLAS 2 tests -------------------------------------------------------------------------- +ConfigureTest(FA2_TEST layout/force_atlas2_test.cu) -set(FA2_TEST_SRC - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/layout/force_atlas2_test.cu") - -ConfigureTest(FA2_TEST "${FA2_TEST_SRC}" "") +################################################################################################### +# - TSP tests -------------------------------------------------------------------------- +ConfigureTest(TSP_TEST traversal/tsp_test.cu) ################################################################################################### # - CONNECTED COMPONENTS tests ------------------------------------------------------------------- - -set(CONNECT_TEST_SRC - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/components/con_comp_test.cu") - -ConfigureTest(CONNECT_TEST "${CONNECT_TEST_SRC}" "") +ConfigureTest(CONNECT_TEST components/con_comp_test.cu) ################################################################################################### # - STRONGLY CONNECTED COMPONENTS tests ---------------------------------------------------------- +ConfigureTest(SCC_TEST components/scc_test.cu) -set(SCC_TEST_SRC - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/components/scc_test.cu") +################################################################################################### +# - WEAKLY CONNECTED COMPONENTS tests ---------------------------------------------------------- +ConfigureTest(WCC_TEST components/wcc_test.cpp) -ConfigureTest(SCC_TEST "${SCC_TEST_SRC}" "") +################################################################################################### +#-Hungarian (Linear Assignment Problem) tests ---------------------------------------------------- +ConfigureTest(HUNGARIAN_TEST linear_assignment/hungarian_test.cu) ################################################################################################### -#-Hungarian (Linear Assignment Problem) tests --------------------------------------------------------------------- +# - MST tests ---------------------------------------------------------------------------- +ConfigureTest(MST_TEST tree/mst_test.cu) -set(HUNGARIAN_TEST_SRC - "${CMAKE_CURRENT_SOURCE_DIR}/linear_assignment/hungarian_test.cu") +################################################################################################### +# - Experimental stream tests ----------------------------------------------------- +ConfigureTest(EXPERIMENTAL_STREAM experimental/streams.cu) -ConfigureTest(HUNGARIAN_TEST "${HUNGARIAN_TEST_SRC}" "") 
+################################################################################################### +# - Experimental R-mat graph generation tests ----------------------------------------------------- +ConfigureTest(EXPERIMENTAL_GENERATE_RMAT_TEST experimental/generate_rmat_test.cpp) ################################################################################################### -# - MST tests ---------------------------------------------------------------------------- +# - Experimental Graph tests ---------------------------------------------------------------------- +ConfigureTest(EXPERIMENTAL_GRAPH_TEST experimental/graph_test.cpp) -set(MST_TEST_SRC - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/tree/mst_test.cu") +################################################################################################### +# - Experimental weight-sum tests ----------------------------------------------------------------- +ConfigureTest(EXPERIMENTAL_WEIGHT_SUM_TEST experimental/weight_sum_test.cpp) -ConfigureTest(MST_TEST "${MST_TEST_SRC}" "") +################################################################################################### +# - Experimental degree tests --------------------------------------------------------------------- +ConfigureTest(EXPERIMENTAL_DEGREE_TEST experimental/degree_test.cpp) +################################################################################################### +# - Experimental coarsening tests ----------------------------------------------------------------- +ConfigureTest(EXPERIMENTAL_COARSEN_GRAPH_TEST experimental/coarsen_graph_test.cpp) ################################################################################################### -# - Experimental Graph tests ---------------------------------------------------------------------- +# - Experimental induced subgraph tests ----------------------------------------------------------- +ConfigureTest(EXPERIMENTAL_INDUCED_SUBGRAPH_TEST experimental/induced_subgraph_test.cpp) -set(EXPERIMENTAL_GRAPH_TEST_SRCS - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/experimental/graph_test.cpp") +################################################################################################### +# - Experimental BFS tests ------------------------------------------------------------------------ +ConfigureTest(EXPERIMENTAL_BFS_TEST experimental/bfs_test.cpp) -ConfigureTest(EXPERIMENTAL_GRAPH_TEST "${EXPERIMENTAL_GRAPH_TEST_SRCS}" "") +################################################################################################### +# - Experimental Multi-source BFS tests ----------------------------------------------------------- +ConfigureTest(EXPERIMENTAL_MSBFS_TEST experimental/ms_bfs_test.cpp) ################################################################################################### -# - Experimental BFS tests ------------------------------------------------------------------------ +# - Experimental SSSP tests ----------------------------------------------------------------------- +ConfigureTest(EXPERIMENTAL_SSSP_TEST experimental/sssp_test.cpp) -set(EXPERIMENTAL_BFS_TEST_SRCS - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/experimental/bfs_test.cpp") +################################################################################################### +# - Experimental PAGERANK tests ------------------------------------------------------------------- +ConfigureTest(EXPERIMENTAL_PAGERANK_TEST 
experimental/pagerank_test.cpp) -ConfigureTest(EXPERIMENTAL_BFS_TEST "${EXPERIMENTAL_BFS_TEST_SRCS}" "") +################################################################################################### +# - Experimental KATZ_CENTRALITY tests ------------------------------------------------------------ +ConfigureTest(EXPERIMENTAL_KATZ_CENTRALITY_TEST experimental/katz_centrality_test.cpp) ################################################################################################### -# - Experimental SSSP tests ----------------------------------------------------------------------- +# - WEAKLY CONNECTED COMPONENTS tests ------------------------------------------------------------- -set(EXPERIMENTAL_SSSP_TEST_SRCS - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/experimental/sssp_test.cpp") +set(WEAKLY_CONNECTED_COMPONENTS_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/components/weakly_connected_components_test.cpp") -ConfigureTest(EXPERIMENTAL_SSSP_TEST "${EXPERIMENTAL_SSSP_TEST_SRCS}" "") +ConfigureTest(WEAKLY_CONNECTED_COMPONENTS_TEST "${WEAKLY_CONNECTED_COMPONENTS_TEST_SRCS}") ################################################################################################### -# - Experimental PAGERANK tests ------------------------------------------------------------------- +# - Experimental RANDOM_WALKS tests --------------------------------------------------------------- +ConfigureTest(RANDOM_WALKS_TEST sampling/random_walks_test.cu) -set(EXPERIMENTAL_PAGERANK_TEST_SRCS - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/experimental/pagerank_test.cpp") +################################################################################################### +ConfigureTest(RANDOM_WALKS_LOW_LEVEL_TEST sampling/rw_low_level_test.cu) -ConfigureTest(EXPERIMENTAL_PAGERANK_TEST "${EXPERIMENTAL_PAGERANK_TEST_SRCS}" "") +################################################################################################### +# FIXME: since this is technically not a test, consider refactoring the +# ConfigureTest function to share common code with a new ConfigureBenchmark +# function (which would not link gtest, etc.)
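
(The hunks above all follow the same migration: the per-test `set(<NAME>_TEST_SRC ...)` lists, including the repeated third-party `mmio.c` entry, are replaced by passing the test name and its sources straight to `ConfigureTest`. The function definition itself is outside this diff; the sketch below shows the shape implied by the new call sites, plus the `ConfigureBenchmark` split suggested by the FIXME above for the `RANDOM_WALKS_PROFILING` target registered just below. Target and library names here are illustrative assumptions, not the actual definitions.)

```cmake
# Hypothetical sketch only -- the real ConfigureTest is defined elsewhere in
# this CMakeLists.txt, and ConfigureBenchmark does not exist yet.
function(ConfigureTest CMAKE_TEST_NAME)
  # ${ARGN} holds the source files passed after the name,
  # e.g. ConfigureTest(TRIANGLE_TEST community/triangle_test.cu)
  add_executable(${CMAKE_TEST_NAME} ${ARGN})
  target_link_libraries(${CMAKE_TEST_NAME}
                        PRIVATE cugraph GTest::gtest GTest::gtest_main)
  add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME})
endfunction()

function(ConfigureBenchmark CMAKE_BENCH_NAME)
  # Same executable setup, but no gtest link and no CTest registration,
  # since a profiling binary is not a test.
  add_executable(${CMAKE_BENCH_NAME} ${ARGN})
  target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE cugraph)
endfunction()
```
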
+ConfigureTest(RANDOM_WALKS_PROFILING sampling/random_walks_profiling.cu) ################################################################################################### -# - Experimental LOUVAIN tests ------------------------------------------------------------------- +# - Serialization tests --------------------------------------------------------------------------- + +set(SERIALIZATION_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/serialization/un_serialize_test.cpp") + +ConfigureTest(SERIALIZATION_TEST "${SERIALIZATION_TEST_SRCS}") -# FIXME: Re-enable once failures are fixed -#set(EXPERIMENTAL_LOUVAIN_TEST_SRCS -# "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" -# "${CMAKE_CURRENT_SOURCE_DIR}/experimental/louvain_test.cu") -# -#ConfigureTest(EXPERIMENTAL_LOUVAIN_TEST "${EXPERIMENTAL_LOUVAIN_TEST_SRCS}" "") ################################################################################################### -# - Experimental KATZ_CENTRALITY tests ------------------------------------------------------------ +# - BFS Visitor tests ----------------------------------------------------------------------------- + +set(BFS_VISITOR_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/visitors/bfs_test.cpp") -set(EXPERIMENTAL_KATZ_CENTRALITY_TEST_SRCS - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/experimental/katz_centrality_test.cpp") +ConfigureTest(BFS_VISITOR_TEST "${BFS_VISITOR_TEST_SRCS}") -ConfigureTest(EXPERIMENTAL_KATZ_CENTRALITY_TEST "${EXPERIMENTAL_KATZ_CENTRALITY_TEST_SRCS}" "") +################################################################################################### +# - MG tests -------------------------------------------------------------------------------------- + +if(BUILD_CUGRAPH_MG_TESTS) + + ############################################################################################### + # - find MPI - only enabled if MG tests are to be built + find_package(MPI REQUIRED) + + execute_process( + COMMAND nvidia-smi -L + COMMAND wc -l + OUTPUT_VARIABLE GPU_COUNT) + + string(REGEX REPLACE "\n$" "" GPU_COUNT ${GPU_COUNT}) + MESSAGE(STATUS "GPU_COUNT: " ${GPU_COUNT}) + + if(MPI_CXX_FOUND) + ########################################################################################### + # - MG PAGERANK tests --------------------------------------------------------------------- + ConfigureTestMG(MG_PAGERANK_TEST pagerank/mg_pagerank_test.cpp) + + ########################################################################################### + # - MG KATZ CENTRALITY tests -------------------------------------------------------------- + ConfigureTestMG(MG_KATZ_CENTRALITY_TEST experimental/mg_katz_centrality_test.cpp) + + ########################################################################################### + # - MG BFS tests -------------------------------------------------------------------------- + ConfigureTestMG(MG_BFS_TEST experimental/mg_bfs_test.cpp) + + ########################################################################################### + # - MG SSSP tests ------------------------------------------------------------------------- + ConfigureTestMG(MG_SSSP_TEST experimental/mg_sssp_test.cpp) + + ########################################################################################### + # - MG LOUVAIN tests ---------------------------------------------------------------------- + ConfigureTestMG(MG_LOUVAIN_TEST + community/mg_louvain_helper.cu + community/mg_louvain_test.cpp) + + 
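
(Each `ConfigureTestMG` call in this block builds one multi-GPU gtest binary; the helper is defined outside this diff. Given the `find_package(MPI REQUIRED)` and the `GPU_COUNT` computed above from `nvidia-smi -L | wc -l`, it presumably differs from `ConfigureTest` mainly by linking MPI and launching the binary with one rank per GPU. A hedged sketch, with the library names and the launch registration as assumptions:)

```cmake
# Hypothetical sketch only -- the real ConfigureTestMG is defined elsewhere.
function(ConfigureTestMG CMAKE_TEST_NAME)
  add_executable(${CMAKE_TEST_NAME} ${ARGN})
  target_link_libraries(${CMAKE_TEST_NAME}
                        PRIVATE cugraph GTest::gtest GTest::gtest_main MPI::MPI_CXX)
  # Register with CTest so the binary runs under one MPI rank per detected GPU;
  # MPIEXEC_EXECUTABLE and MPIEXEC_NUMPROC_FLAG are provided by find_package(MPI).
  add_test(NAME ${CMAKE_TEST_NAME}
           COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${GPU_COUNT}
                   $<TARGET_FILE:${CMAKE_TEST_NAME}>)
endfunction()
```
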
########################################################################################### + # - MG WEAKLY CONNECTED COMPONENTS tests -------------------------------------------------- + ConfigureTestMG(MG_WEAKLY_CONNECTED_COMPONENTS_TEST + components/mg_weakly_connected_components_test.cpp) + + ########################################################################################### + # - MG GRAPH BROADCAST tests -------------------------------------------------------------- + ConfigureTestMG(MG_GRAPH_BROADCAST_TEST bcast/mg_graph_bcast.cpp) + + ########################################################################################### + # - MG PRIMS COUNT_IF_V tests ------------------------------------------------------------- + ConfigureTestMG(MG_COUNT_IF_V_TEST prims/mg_count_if_v.cu) + + ########################################################################################### + # - MG PRIMS REDUCE_V tests --------------------------------------------------------------- + ConfigureTestMG(MG_REDUCE_V_TEST prims/mg_reduce_v.cu) + else() + message(FATAL_ERROR "OpenMPI NOT found, cannot build MG tests.") + endif() +endif() ################################################################################################### ### enable testing ################################################################################ diff --git a/cpp/tests/README.md b/cpp/tests/README.md new file mode 100644 index 00000000000..b5808822467 --- /dev/null +++ b/cpp/tests/README.md @@ -0,0 +1,32 @@ +# libcugraph C++ tests + +## Prerequisites +### Datasets +``` +/path/to/cuGraph> ./datasets/get_test_data.sh +/path/to/cuGraph> export RAPIDS_DATASET_ROOT_DIR=/path/to/cuGraph/datasets +``` +### System Requirements +* MPI (multi-GPU tests only) + ``` + conda install -c conda-forge openmpi + ``` + +## Building +``` +/path/to/cuGraph> ./build.sh libcugraph +``` +To build the multi-GPU tests: +``` +/path/to/cuGraph> ./build.sh libcugraph cpp-mgtests +``` + +## Running +To run a single-GPU test directly (for example): +``` +/path/to/cuGraph> ./cpp/build/gtests/CONNECT_TEST +``` +To run the multi-GPU tests (example using 2 GPUs): +``` +/path/to/cuGraph> mpirun -n 2 ./cpp/build/gtests/MG_PAGERANK_TEST +``` diff --git a/cpp/tests/bcast/mg_graph_bcast.cpp b/cpp/tests/bcast/mg_graph_bcast.cpp new file mode 100644 index 00000000000..1a0d4c558c9 --- /dev/null +++ b/cpp/tests/bcast/mg_graph_bcast.cpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Andrei Schaffer, aschaffer@nvidia.com +// +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + +#include + +//////////////////////////////////////////////////////////////////////////////// +// Test param object.
This defines the input and expected output for a test, and +// will be instantiated as the parameter to the tests defined below using +// INSTANTIATE_TEST_SUITE_P() +// +struct GraphBcast_Usecase { + std::string graph_file_full_path{}; + + // FIXME: We really should have a Graph_Testparms_Base class or something + // like that which can handle this graph_full_path thing. + // + explicit GraphBcast_Usecase(std::string const& graph_file_path) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +}; + +//////////////////////////////////////////////////////////////////////////////// +// Parameterized test fixture, to be used with TEST_P(). This defines common +// setup and teardown steps as well as common utilities used by each E2E MG +// test. In this case, each test is identical except for the inputs and +// expected outputs, so the entire test is defined in the run_test() method. +// +class GraphBcast_MG_Testfixture : public ::testing::TestWithParam { + public: + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + // Run once for each test instance + // + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of broadcasting a graph, + // by comparing the graph that was sent (`sg_graph`) + // with the one that was received (`graph_copy`): + // + template + void run_test(const GraphBcast_Usecase& param) + { + using namespace cugraph::broadcast; + using sg_graph_t = cugraph::experimental::graph_t; + + raft::handle_t handle; + + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); + const auto& comm = handle.get_comms(); + + auto const comm_rank = comm.get_rank(); + + auto [sg_graph, d_renumber_map_labels] = + cugraph::test::read_graph_from_matrix_market_file( + handle, param.graph_file_full_path, true, /*renumber=*/false); + + if (comm_rank == 0) { + graph_broadcast(handle, &sg_graph); + } else { + sg_graph_t* g_ignore{nullptr}; + auto graph_copy = graph_broadcast(handle, g_ignore); + auto [same, str_fail] = cugraph::test::compare_graphs(handle, sg_graph, graph_copy); + + if (!same) std::cerr << "Graph comparison failed on " << str_fail << '\n'; + + ASSERT_TRUE(same); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// +TEST_P(GraphBcast_MG_Testfixture, CheckInt32Int32Float) +{ + run_test(GetParam()); +} + +INSTANTIATE_TEST_SUITE_P(simple_test, + GraphBcast_MG_Testfixture, + ::testing::Values(GraphBcast_Usecase("test/datasets/karate.mtx") + //,GraphBcast_Usecase("test/datasets/smallworld.mtx") + )); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/centrality/betweenness_centrality_test.cu b/cpp/tests/centrality/betweenness_centrality_test.cu index d680574e10b..a171b0010d1 100644 --- a/cpp/tests/centrality/betweenness_centrality_test.cu +++ b/cpp/tests/centrality/betweenness_centrality_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -18,13 +18,14 @@ #include #include -#include -#include +#include +#include #include #include #include +#include #include @@ -49,68 +50,80 @@ // C++ Reference Implementation // ============================================================================ template -void ref_accumulation(result_t *result, +void ref_accumulation(result_t* result, vertex_t const number_of_vertices, - std::stack &S, - std::vector> &pred, - std::vector &sigmas, - std::vector &deltas, + std::stack& S, + std::vector>& pred, + std::vector& sigmas, + std::vector& deltas, vertex_t source) { - for (vertex_t v = 0; v < number_of_vertices; ++v) { deltas[v] = 0; } + for (vertex_t v = 0; v < number_of_vertices; ++v) { + deltas[v] = 0; + } while (!S.empty()) { vertex_t w = S.top(); S.pop(); - for (vertex_t v : pred[w]) { deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); } + for (vertex_t v : pred[w]) { + deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); + } if (w != source) { result[w] += deltas[w]; } } } template -void ref_endpoints_accumulation(result_t *result, +void ref_endpoints_accumulation(result_t* result, vertex_t const number_of_vertices, - std::stack &S, - std::vector> &pred, - std::vector &sigmas, - std::vector &deltas, + std::stack& S, + std::vector>& pred, + std::vector& sigmas, + std::vector& deltas, vertex_t source) { result[source] += S.size() - 1; - for (vertex_t v = 0; v < number_of_vertices; ++v) { deltas[v] = 0; } + for (vertex_t v = 0; v < number_of_vertices; ++v) { + deltas[v] = 0; + } while (!S.empty()) { vertex_t w = S.top(); S.pop(); - for (vertex_t v : pred[w]) { deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); } + for (vertex_t v : pred[w]) { + deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); + } if (w != source) { result[w] += deltas[w] + 1; } } } template -void ref_edge_accumulation(result_t *result, +void ref_edge_accumulation(result_t* result, vertex_t const number_of_vertices, - std::stack &S, - std::vector> &pred, - std::vector &sigmas, - std::vector &deltas, + std::stack& S, + std::vector>& pred, + std::vector& sigmas, + std::vector& deltas, vertex_t source) { - for (vertex_t v = 0; v < number_of_vertices; ++v) { deltas[v] = 0; } + for (vertex_t v = 0; v < number_of_vertices; ++v) { + deltas[v] = 0; + } while (!S.empty()) { vertex_t w = S.top(); S.pop(); - for (vertex_t v : pred[w]) { deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); } + for (vertex_t v : pred[w]) { + deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); + } if (w != source) { result[w] += deltas[w]; } } } // Algorithm 1: Shortest-path vertex betweenness, (Brandes, 2001) template -void reference_betweenness_centrality_impl(vertex_t *indices, - edge_t *offsets, +void reference_betweenness_centrality_impl(vertex_t* indices, + edge_t* offsets, vertex_t const number_of_vertices, - result_t *result, + result_t* result, bool endpoints, - vertex_t const *sources, + vertex_t const* sources, vertex_t const number_of_sources) { std::queue Q; @@ -158,7 +171,7 @@ void reference_betweenness_centrality_impl(vertex_t *indices, } template -void reference_rescale(result_t *result, +void reference_rescale(result_t* result, bool directed, bool normalize, bool endpoints, @@ -189,25 +202,27 @@ void reference_rescale(result_t *result, rescale_factor *= (casted_number_of_vertices / casted_number_of_sources); } } - for (auto idx = 0; idx < number_of_vertices; ++idx) { result[idx] *= rescale_factor; } + for (auto idx = 0; idx < number_of_vertices; ++idx) { + result[idx] *= rescale_factor; + } } 
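
(For reference while reading the hunks that follow: `compare_close`, whose signature is updated a few hunks below, is the tolerance check these tests use to compare GPU results against the host reference. Below is a small standalone illustration of its semantics; the function body mirrors the one visible in this diff, while the numeric values are made up for the example:)

```cpp
// Standalone sketch of the relative-error comparison used by these tests.
#include <cassert>

template <typename T, typename precision_t>
bool compare_close(const T& a, const T& b, const precision_t epsilon, precision_t zero_threshold)
{
  // Values both under zero_threshold compare equal; otherwise a must lie
  // within a relative band of epsilon around b.
  return ((zero_threshold > a && zero_threshold > b)) ||
         (a >= b * (1.0 - epsilon)) && (a <= b * (1.0 + epsilon));
}

int main()
{
  assert(compare_close(1.0001, 1.0, 0.001, 1e-6));   // within 0.1% relative error
  assert(!compare_close(1.0100, 1.0, 0.001, 1e-6));  // outside the band
  assert(compare_close(1e-9, 5e-9, 0.001, 1e-6));    // both treated as zero
  return 0;
}
```
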
template void reference_betweenness_centrality( - cugraph::GraphCSRView const &graph, - result_t *result, + cugraph::legacy::GraphCSRView const& graph, + result_t* result, bool normalize, bool endpoints, // This is not yet implemented vertex_t const number_of_sources, - vertex_t const *sources) + vertex_t const* sources) { vertex_t number_of_vertices = graph.number_of_vertices; edge_t number_of_edges = graph.number_of_edges; thrust::host_vector h_indices(number_of_edges); thrust::host_vector h_offsets(number_of_vertices + 1); - thrust::device_ptr d_indices((vertex_t *)&graph.indices[0]); - thrust::device_ptr d_offsets((edge_t *)&graph.offsets[0]); + thrust::device_ptr d_indices((vertex_t*)&graph.indices[0]); + thrust::device_ptr d_offsets((edge_t*)&graph.offsets[0]); thrust::copy(d_indices, d_indices + number_of_edges, h_indices.begin()); thrust::copy(d_offsets, d_offsets + (number_of_vertices + 1), h_offsets.begin()); @@ -227,14 +242,14 @@ void reference_betweenness_centrality( // Explicit instantiation /* FIXME!!! template void reference_betweenness_centrality( - cugraph::GraphCSRView const &, + cugraph::legacy::GraphCSRView const &, float *, bool, bool, const int, int const *); template void reference_betweenness_centrality( - cugraph::GraphCSRView const &, + cugraph::legacy::GraphCSRView const &, double *, bool, bool, @@ -248,7 +263,7 @@ template void reference_betweenness_centrality( // Compare while allowing relative error of epsilon // zero_threshold indicates when we should drop comparison for small numbers template -bool compare_close(const T &a, const T &b, const precision_t epsilon, precision_t zero_threshold) +bool compare_close(const T& a, const T& b, const precision_t epsilon, precision_t zero_threshold) { return ((zero_threshold > a && zero_threshold > b)) || (a >= b * (1.0 - epsilon)) && (a <= b * (1.0 + epsilon)); @@ -264,12 +279,12 @@ typedef struct BC_Usecase_t { std::string config_; // Path to graph file std::string file_path_; // Complete path to graph using dataset_root_dir int number_of_sources_; // Number of sources for the traversal - BC_Usecase_t(const std::string &config, int number_of_sources) + BC_Usecase_t(const std::string& config, int number_of_sources) : config_(config), number_of_sources_(number_of_sources) { // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR // FIXME: Use platform independent stuff from c++14/17 on compiler update - const std::string &rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); + const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); if ((config_ != "") && (config_[0] != '/')) { file_path_ = rapidsDatasetRootDir + "/" + config_; } else { @@ -300,15 +315,15 @@ class Tests_BC : public ::testing::TestWithParam { typename result_t, bool normalize, bool endpoints> - void run_current_test(const BC_Usecase &configuration) + void run_current_test(const BC_Usecase& configuration) { // Step 1: Construction of the graph based on configuration bool is_directed = false; auto csr = cugraph::test::generate_graph_csr_from_mm( is_directed, configuration.file_path_); cudaDeviceSynchronize(); - cugraph::GraphCSRView G = csr->view(); - G.prop.directed = is_directed; + cugraph::legacy::GraphCSRView G = csr->view(); + G.prop.directed = is_directed; CUDA_TRY(cudaGetLastError()); std::vector result(G.number_of_vertices, 0); std::vector expected(G.number_of_vertices, 0); @@ -323,7 +338,7 @@ class Tests_BC : public ::testing::TestWithParam { std::vector sources(configuration.number_of_sources_); 
thrust::sequence(thrust::host, sources.begin(), sources.end(), 0); - vertex_t *sources_ptr = nullptr; + vertex_t* sources_ptr = nullptr; if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); } reference_betweenness_centrality( @@ -338,7 +353,7 @@ class Tests_BC : public ::testing::TestWithParam { d_result.data().get(), normalize, endpoints, - static_cast(nullptr), + static_cast(nullptr), configuration.number_of_sources_, sources_ptr); cudaDeviceSynchronize(); @@ -363,6 +378,9 @@ TEST_P(Tests_BC, CheckFP32_NO_NORMALIZE_NO_ENDPOINTS) run_current_test(GetParam()); } +#if 0 +// Temporarily disable some of the test combinations +// Full solution will be explored for issue #1555 TEST_P(Tests_BC, CheckFP64_NO_NORMALIZE_NO_ENDPOINTS) { run_current_test(GetParam()); @@ -372,6 +390,7 @@ TEST_P(Tests_BC, CheckFP32_NO_NORMALIZE_ENDPOINTS) { run_current_test(GetParam()); } +#endif TEST_P(Tests_BC, CheckFP64_NO_NORMALIZE_ENDPOINTS) { @@ -384,6 +403,9 @@ TEST_P(Tests_BC, CheckFP32_NORMALIZE_NO_ENDPOINTS) run_current_test(GetParam()); } +#if 0 +// Temporarily disable some of the test combinations +// Full solution will be explored for issue #1555 TEST_P(Tests_BC, CheckFP64_NORMALIZE_NO_ENDPOINTS) { run_current_test(GetParam()); @@ -393,18 +415,29 @@ TEST_P(Tests_BC, CheckFP32_NORMALIZE_ENDPOINTS) { run_current_test(GetParam()); } +#endif TEST_P(Tests_BC, CheckFP64_NORMALIZE_ENDPOINTS) { run_current_test(GetParam()); } -INSTANTIATE_TEST_CASE_P(simple_test, - Tests_BC, - ::testing::Values(BC_Usecase("test/datasets/karate.mtx", 0), - BC_Usecase("test/datasets/netscience.mtx", 0), - BC_Usecase("test/datasets/netscience.mtx", 4), - BC_Usecase("test/datasets/wiki2003.mtx", 4), - BC_Usecase("test/datasets/wiki-Talk.mtx", 4))); +#if 0 +// Temporarily disable some of the test combinations +// Full solution will be explored for issue #1555 +INSTANTIATE_TEST_SUITE_P(simple_test, + Tests_BC, + ::testing::Values(BC_Usecase("test/datasets/karate.mtx", 0), + BC_Usecase("test/datasets/netscience.mtx", 0), + BC_Usecase("test/datasets/netscience.mtx", 4), + BC_Usecase("test/datasets/wiki2003.mtx", 4), + BC_Usecase("test/datasets/wiki-Talk.mtx", 4))); +#else +INSTANTIATE_TEST_SUITE_P(simple_test, + Tests_BC, + ::testing::Values(BC_Usecase("test/datasets/karate.mtx", 0), + BC_Usecase("test/datasets/netscience.mtx", 0), + BC_Usecase("test/datasets/netscience.mtx", 4))); +#endif CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/centrality/edge_betweenness_centrality_test.cu b/cpp/tests/centrality/edge_betweenness_centrality_test.cu index b6cce8684e8..67fdb22f953 100644 --- a/cpp/tests/centrality/edge_betweenness_centrality_test.cu +++ b/cpp/tests/centrality/edge_betweenness_centrality_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,11 +22,12 @@ #include #include +#include #include -#include -#include +#include +#include #include #include @@ -52,8 +53,8 @@ template edge_t get_edge_index_from_source_and_destination(vertex_t source_vertex, vertex_t destination_vertex, - vertex_t const *indices, - edge_t const *offsets) + vertex_t const* indices, + edge_t const* offsets) { edge_t index = -1; edge_t first_edge_idx = offsets[source_vertex]; @@ -64,17 +65,19 @@ edge_t get_edge_index_from_source_and_destination(vertex_t source_vertex, } template -void ref_accumulation(result_t *result, - vertex_t const *indices, - edge_t const *offsets, +void ref_accumulation(result_t* result, + vertex_t const* indices, + edge_t const* offsets, vertex_t const number_of_vertices, - std::stack &S, - std::vector> &pred, - std::vector &sigmas, - std::vector &deltas, + std::stack& S, + std::vector>& pred, + std::vector& sigmas, + std::vector& deltas, vertex_t source) { - for (vertex_t v = 0; v < number_of_vertices; ++v) { deltas[v] = 0; } + for (vertex_t v = 0; v < number_of_vertices; ++v) { + deltas[v] = 0; + } while (!S.empty()) { vertex_t w = S.top(); S.pop(); @@ -92,11 +95,11 @@ void ref_accumulation(result_t *result, // Algorithm 1: Shortest-path edge betweenness (Brandes, 2001) template -void reference_edge_betweenness_centrality_impl(vertex_t *indices, - edge_t *offsets, +void reference_edge_betweenness_centrality_impl(vertex_t* indices, + edge_t* offsets, vertex_t const number_of_vertices, - result_t *result, - vertex_t const *sources, + result_t* result, + vertex_t const* sources, vertex_t const number_of_sources) { std::queue Q; @@ -134,7 +137,7 @@ void reference_edge_betweenness_centrality_impl(vertex_t *indices, } template -void reference_rescale(result_t *result, +void reference_rescale(result_t* result, bool directed, bool normalize, vertex_t const number_of_vertices, @@ -149,24 +152,26 @@ void reference_rescale(result_t *result, } else { if (!directed) { rescale_factor /= static_cast(2); } } - for (auto idx = 0; idx < number_of_edges; ++idx) { result[idx] *= rescale_factor; } + for (auto idx = 0; idx < number_of_edges; ++idx) { + result[idx] *= rescale_factor; + } } template void reference_edge_betweenness_centrality( - cugraph::GraphCSRView const &graph, - result_t *result, + cugraph::legacy::GraphCSRView const& graph, + result_t* result, bool normalize, vertex_t const number_of_sources, - vertex_t const *sources) + vertex_t const* sources) { vertex_t number_of_vertices = graph.number_of_vertices; edge_t number_of_edges = graph.number_of_edges; thrust::host_vector h_indices(number_of_edges); thrust::host_vector h_offsets(number_of_vertices + 1); - thrust::device_ptr d_indices((vertex_t *)&graph.indices[0]); - thrust::device_ptr d_offsets((edge_t *)&graph.offsets[0]); + thrust::device_ptr d_indices((vertex_t*)&graph.indices[0]); + thrust::device_ptr d_offsets((edge_t*)&graph.offsets[0]); thrust::copy(d_indices, d_indices + number_of_edges, h_indices.begin()); thrust::copy(d_offsets, d_offsets + (number_of_vertices + 1), h_offsets.begin()); @@ -185,7 +190,7 @@ void reference_edge_betweenness_centrality( // Compare while allowing relative error of epsilon // zero_threshold indicates when we should drop comparison for small numbers template -bool compare_close(const T &a, const T &b, const precision_t epsilon, precision_t zero_threshold) +bool compare_close(const T& a, const T& b, const precision_t epsilon, precision_t zero_threshold) { return ((zero_threshold > a && zero_threshold > b)) || (a >= b * (1.0 - epsilon)) && (a <= 
b * (1.0 + epsilon)); @@ -201,12 +206,12 @@ typedef struct EdgeBC_Usecase_t { std::string config_; // Path to graph file std::string file_path_; // Complete path to graph using dataset_root_dir int number_of_sources_; // Number of sources for the traversal - EdgeBC_Usecase_t(const std::string &config, int number_of_sources) + EdgeBC_Usecase_t(const std::string& config, int number_of_sources) : config_(config), number_of_sources_(number_of_sources) { // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR // FIXME: Use platform independent stuff from c++14/17 on compiler update - const std::string &rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); + const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); if ((config_ != "") && (config_[0] != '/')) { file_path_ = rapidsDatasetRootDir + "/" + config_; } else { @@ -236,15 +241,15 @@ class Tests_EdgeBC : public ::testing::TestWithParam { typename weight_t, typename result_t, bool normalize> - void run_current_test(const EdgeBC_Usecase &configuration) + void run_current_test(const EdgeBC_Usecase& configuration) { // Step 1: Construction of the graph based on configuration bool is_directed = false; auto csr = cugraph::test::generate_graph_csr_from_mm( is_directed, configuration.file_path_); cudaDeviceSynchronize(); - cugraph::GraphCSRView G = csr->view(); - G.prop.directed = is_directed; + cugraph::legacy::GraphCSRView G = csr->view(); + G.prop.directed = is_directed; CUDA_TRY(cudaGetLastError()); std::vector result(G.number_of_edges, 0); std::vector expected(G.number_of_edges, 0); @@ -259,7 +264,7 @@ class Tests_EdgeBC : public ::testing::TestWithParam { std::vector sources(configuration.number_of_sources_); thrust::sequence(thrust::host, sources.begin(), sources.end(), 0); - vertex_t *sources_ptr = nullptr; + vertex_t* sources_ptr = nullptr; if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); } reference_edge_betweenness_centrality( @@ -273,7 +278,7 @@ class Tests_EdgeBC : public ::testing::TestWithParam { G, d_result.data().get(), normalize, - static_cast(nullptr), + static_cast(nullptr), configuration.number_of_sources_, sources_ptr); CUDA_TRY(cudaMemcpy(result.data(), @@ -296,6 +301,9 @@ TEST_P(Tests_EdgeBC, CheckFP32_NO_NORMALIZE) run_current_test(GetParam()); } +#if 0 +// Temporarily disable some of the test combinations +// Full solution will be explored for issue #1555 TEST_P(Tests_EdgeBC, CheckFP64_NO_NORMALIZE) { run_current_test(GetParam()); @@ -306,18 +314,29 @@ TEST_P(Tests_EdgeBC, CheckFP32_NORMALIZE) { run_current_test(GetParam()); } +#endif TEST_P(Tests_EdgeBC, CheckFP64_NORMALIZE) { run_current_test(GetParam()); } -INSTANTIATE_TEST_CASE_P(simple_test, - Tests_EdgeBC, - ::testing::Values(EdgeBC_Usecase("test/datasets/karate.mtx", 0), - EdgeBC_Usecase("test/datasets/netscience.mtx", 0), - EdgeBC_Usecase("test/datasets/netscience.mtx", 4), - EdgeBC_Usecase("test/datasets/wiki2003.mtx", 4), - EdgeBC_Usecase("test/datasets/wiki-Talk.mtx", 4))); +#if 0 +// Temporarily disable some of the test combinations +// Full solution will be explored for issue #1555 +INSTANTIATE_TEST_SUITE_P(simple_test, + Tests_EdgeBC, + ::testing::Values(EdgeBC_Usecase("test/datasets/karate.mtx", 0), + EdgeBC_Usecase("test/datasets/netscience.mtx", 0), + EdgeBC_Usecase("test/datasets/netscience.mtx", 4), + EdgeBC_Usecase("test/datasets/wiki2003.mtx", 4), + EdgeBC_Usecase("test/datasets/wiki-Talk.mtx", 4))); +#else +INSTANTIATE_TEST_SUITE_P(simple_test, + Tests_EdgeBC, + 
::testing::Values(EdgeBC_Usecase("test/datasets/karate.mtx", 0), + EdgeBC_Usecase("test/datasets/netscience.mtx", 0), + EdgeBC_Usecase("test/datasets/netscience.mtx", 4))); +#endif CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/centrality/katz_centrality_test.cu b/cpp/tests/centrality/katz_centrality_test.cu index c4f17192955..ee2df5347fc 100644 --- a/cpp/tests/centrality/katz_centrality_test.cu +++ b/cpp/tests/centrality/katz_centrality_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,8 +20,8 @@ #include -#include -#include +#include +#include #include #include @@ -35,7 +35,9 @@ std::vector getGoldenTopKIds(std::ifstream& fs_result, int k = 10) std::vector vec; int val; int count = 0; - while (fs_result >> val && ((count++) < k)) { vec.push_back(val); } + while (fs_result >> val && ((count++) < k)) { + vec.push_back(val); + } vec.resize(k); return vec; } @@ -56,13 +58,13 @@ std::vector getTopKIds(double* p_katz, int count, int k = 10) } template -int getMaxDegree(cugraph::GraphCSRView const& g) +int getMaxDegree(cugraph::legacy::GraphCSRView const& g) { cudaStream_t stream{nullptr}; rmm::device_vector degree_vector(g.number_of_vertices); ET* p_degree = degree_vector.data().get(); - g.degree(p_degree, cugraph::DegreeDirection::OUT); + g.degree(p_degree, cugraph::legacy::DegreeDirection::OUT); ET max_out_degree = thrust::reduce(rmm::exec_policy(stream)->on(stream), p_degree, p_degree + g.number_of_vertices, @@ -137,9 +139,10 @@ class Tests_Katz : public ::testing::TestWithParam { << "\n"; ASSERT_EQ(fclose(fpin), 0); - cugraph::GraphCOOView cooview(&cooColInd[0], &cooRowInd[0], nullptr, m, nnz); - auto csr = cugraph::coo_to_csr(cooview); - cugraph::GraphCSRView G = csr->view(); + cugraph::legacy::GraphCOOView cooview( + &cooColInd[0], &cooRowInd[0], nullptr, m, nnz); + auto csr = cugraph::coo_to_csr(cooview); + cugraph::legacy::GraphCSRView G = csr->view(); rmm::device_vector katz_vector(m); double* d_katz = thrust::raw_pointer_cast(katz_vector.data()); @@ -156,7 +159,7 @@ class Tests_Katz : public ::testing::TestWithParam { } }; -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( simple_test, Tests_Katz, ::testing::Values(Katz_Usecase("test/datasets/karate.mtx", "ref/katz/karate.csv"), diff --git a/cpp/tests/community/balanced_edge_test.cpp b/cpp/tests/community/balanced_edge_test.cpp index 81cee945821..d4c5edf3f35 100644 --- a/cpp/tests/community/balanced_edge_test.cpp +++ b/cpp/tests/community/balanced_edge_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 
* * NVIDIA CORPORATION and its licensors retain all intellectual property * and proprietary rights in and to this software, related documentation @@ -10,7 +10,7 @@ */ #include -#include +#include #include @@ -48,7 +48,7 @@ TEST(balanced_edge, success) rmm::device_vector weights_v(w_h); rmm::device_vector result_v(cluster_id); - cugraph::GraphCSRView G( + cugraph::legacy::GraphCSRView G( offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); int num_clusters{8}; diff --git a/cpp/tests/community/ecg_test.cu b/cpp/tests/community/ecg_test.cpp similarity index 70% rename from cpp/tests/community/ecg_test.cu rename to cpp/tests/community/ecg_test.cpp index 85b80b1610b..f174d882937 100644 --- a/cpp/tests/community/ecg_test.cu +++ b/cpp/tests/community/ecg_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * * NVIDIA CORPORATION and its licensors retain all intellectual property * and proprietary rights in and to this software, related documentation @@ -10,8 +10,8 @@ */ #include -#include -#include +#include +#include #include @@ -47,7 +47,7 @@ TEST(ecg, success) rmm::device_vector weights_v(w_h); rmm::device_vector result_v(cluster_id); - cugraph::GraphCSRView graph_csr( + cugraph::legacy::GraphCSRView graph_csr( offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); raft::handle_t handle; @@ -77,6 +77,10 @@ TEST(ecg, success) TEST(ecg, dolphin) { + raft::handle_t handle; + + auto stream = handle.get_stream(); + std::vector off_h = {0, 6, 14, 18, 21, 22, 26, 32, 37, 43, 50, 55, 56, 57, 65, 77, 84, 90, 99, 106, 110, 119, 125, 126, 129, 135, 138, 141, 146, 151, 160, 165, 166, 169, 179, 184, 185, 192, 203, @@ -103,38 +107,57 @@ TEST(ecg, dolphin) int num_verts = off_h.size() - 1; int num_edges = ind_h.size(); - thrust::host_vector cluster_id(num_verts, -1); + std::vector cluster_id(num_verts, -1); - rmm::device_vector offsets_v(off_h); - rmm::device_vector indices_v(ind_h); - rmm::device_vector weights_v(w_h); - rmm::device_vector result_v(cluster_id); + rmm::device_uvector offsets_v(num_verts + 1, stream); + rmm::device_uvector indices_v(num_edges, stream); + rmm::device_uvector weights_v(num_edges, stream); + rmm::device_uvector result_v(num_verts, stream); - cugraph::GraphCSRView graph_csr( - offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); + raft::update_device(offsets_v.data(), off_h.data(), off_h.size(), stream); + raft::update_device(indices_v.data(), ind_h.data(), ind_h.size(), stream); + raft::update_device(weights_v.data(), w_h.data(), w_h.size(), stream); - raft::handle_t handle; - cugraph::ecg(handle, graph_csr, .05, 16, result_v.data().get()); + cugraph::legacy::GraphCSRView graph_csr( + offsets_v.data(), indices_v.data(), weights_v.data(), num_verts, num_edges); - cluster_id = result_v; - int max = *max_element(cluster_id.begin(), cluster_id.end()); - int min = *min_element(cluster_id.begin(), cluster_id.end()); + // "FIXME": remove this check once we drop support for Pascal + // + // Calling louvain on Pascal will throw an exception, we'll check that + // this is the behavior while we still support Pascal (device_prop.major < 7) + // + if (handle.get_device_properties().major < 7) { + EXPECT_THROW( + (cugraph::ecg(handle, graph_csr, .05, 16, result_v.data())), + cugraph::logic_error); + } else { + cugraph::ecg(handle, graph_csr, .05, 16, 
result_v.data()); - ASSERT_EQ((min >= 0), 1); + raft::update_host(cluster_id.data(), result_v.data(), num_verts, stream); - std::set cluster_ids; - for (auto c : cluster_id) { cluster_ids.insert(c); } + CUDA_TRY(cudaDeviceSynchronize()); - ASSERT_EQ(cluster_ids.size(), size_t(max + 1)); + int max = *max_element(cluster_id.begin(), cluster_id.end()); + int min = *min_element(cluster_id.begin(), cluster_id.end()); - float modularity{0.0}; + ASSERT_EQ((min >= 0), 1); - cugraph::ext_raft::analyzeClustering_modularity( - graph_csr, max + 1, result_v.data().get(), &modularity); + std::set cluster_ids; + for (auto c : cluster_id) { + cluster_ids.insert(c); + } + + ASSERT_EQ(cluster_ids.size(), size_t(max + 1)); + + float modularity{0.0}; + + cugraph::ext_raft::analyzeClustering_modularity( + graph_csr, max + 1, result_v.data(), &modularity); - float random_modularity{0.95 * 0.4962422251701355}; + float random_modularity{0.95 * 0.4962422251701355}; - ASSERT_GT(modularity, random_modularity); + ASSERT_GT(modularity, random_modularity); + } } CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/community/egonet_test.cu b/cpp/tests/community/egonet_test.cu new file mode 100644 index 00000000000..29be3508de7 --- /dev/null +++ b/cpp/tests/community/egonet_test.cu @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include + +typedef struct InducedEgo_Usecase_t { + std::string graph_file_full_path{}; + std::vector ego_sources{}; + int32_t radius; + bool test_weighted{false}; + + InducedEgo_Usecase_t(std::string const& graph_file_path, + std::vector const& ego_sources, + int32_t radius, + bool test_weighted) + : ego_sources(ego_sources), radius(radius), test_weighted(test_weighted) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +} InducedEgo_Usecase; + +class Tests_InducedEgo : public ::testing::TestWithParam { + public: + Tests_InducedEgo() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(InducedEgo_Usecase const& configuration) + { + int n_streams = std::min(configuration.ego_sources.size(), static_cast(128)); + raft::handle_t handle(n_streams); + + cugraph::experimental::graph_t graph( + handle); + std::tie(graph, std::ignore) = cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.graph_file_full_path, configuration.test_weighted, false); + auto graph_view = graph.view(); + + rmm::device_uvector d_ego_sources(configuration.ego_sources.size(), + handle.get_stream()); + + raft::update_device(d_ego_sources.data(), + configuration.ego_sources.data(), + configuration.ego_sources.size(), + handle.get_stream()); + + HighResTimer hr_timer; + hr_timer.start("egonet"); + cudaProfilerStart(); + auto [d_ego_edgelist_src, d_ego_edgelist_dst, d_ego_edgelist_weights, d_ego_edge_offsets] = + cugraph::experimental::extract_ego(handle, + graph_view, + d_ego_sources.data(), + static_cast(configuration.ego_sources.size()), + configuration.radius); + cudaProfilerStop(); + hr_timer.stop(); + hr_timer.display(std::cout); + std::vector h_cugraph_ego_edge_offsets(d_ego_edge_offsets.size()); + std::vector h_cugraph_ego_edgelist_src(d_ego_edgelist_src.size()); + std::vector h_cugraph_ego_edgelist_dst(d_ego_edgelist_dst.size()); + raft::update_host(h_cugraph_ego_edgelist_src.data(), + d_ego_edgelist_src.data(), + d_ego_edgelist_src.size(), + handle.get_stream()); + raft::update_host(h_cugraph_ego_edgelist_dst.data(), + d_ego_edgelist_dst.data(), + d_ego_edgelist_dst.size(), + handle.get_stream()); + raft::update_host(h_cugraph_ego_edge_offsets.data(), + d_ego_edge_offsets.data(), + d_ego_edge_offsets.size(), + handle.get_stream()); + ASSERT_TRUE(d_ego_edge_offsets.size() == (configuration.ego_sources.size() + 1)); + ASSERT_TRUE(d_ego_edgelist_src.size() == d_ego_edgelist_dst.size()); + if (configuration.test_weighted) + ASSERT_TRUE(d_ego_edgelist_src.size() == (*d_ego_edgelist_weights).size()); + ASSERT_TRUE(h_cugraph_ego_edge_offsets[configuration.ego_sources.size()] == + d_ego_edgelist_src.size()); + for (size_t i = 0; i < configuration.ego_sources.size(); i++) + ASSERT_TRUE(h_cugraph_ego_edge_offsets[i] <= h_cugraph_ego_edge_offsets[i + 1]); + auto n_vertices = graph_view.get_number_of_vertices(); + for (size_t i = 0; i < d_ego_edgelist_src.size(); i++) { + ASSERT_TRUE( + cugraph::experimental::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_src[i])); + ASSERT_TRUE( + cugraph::experimental::is_valid_vertex(n_vertices, 
h_cugraph_ego_edgelist_dst[i])); + } + } +}; + +TEST_P(Tests_InducedEgo, CheckInt32Int32FloatUntransposed) +{ + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_SUITE_P( + simple_test, + Tests_InducedEgo, + ::testing::Values( + InducedEgo_Usecase("test/datasets/karate.mtx", std::vector{0}, 1, false), + InducedEgo_Usecase("test/datasets/karate.mtx", std::vector{0}, 2, false), + InducedEgo_Usecase("test/datasets/karate.mtx", std::vector{1}, 3, false), + InducedEgo_Usecase("test/datasets/karate.mtx", std::vector{10, 0, 5}, 2, false), + InducedEgo_Usecase("test/datasets/karate.mtx", std::vector{9, 3, 10}, 2, false), + InducedEgo_Usecase( + "test/datasets/karate.mtx", std::vector{5, 9, 3, 10, 12, 13}, 2, true))); + +// For perf analysis +/* +INSTANTIATE_TEST_SUITE_P( + simple_test, + Tests_InducedEgo, + ::testing::Values( + InducedEgo_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{0}, 1, false), + InducedEgo_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{0}, 2, false), + InducedEgo_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{0}, 3, false), + InducedEgo_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{0}, 4, false), + InducedEgo_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{0}, 5, false), + InducedEgo_Usecase( + "test/datasets/soc-LiveJournal1.mtx", std::vector{363617}, 2, false), + InducedEgo_Usecase( + "test/datasets/soc-LiveJournal1.mtx", + std::vector{ + 363617, 722214, 2337449, 2510183, 2513389, 225853, 2035807, 3836330, 1865496, 28755}, + 2, + false), + InducedEgo_Usecase( + "test/datasets/soc-LiveJournal1.mtx", + std::vector{ + 363617, 722214, 2337449, 2510183, 2513389, 225853, 2035807, 3836330, 1865496, 28755, + 2536834, 3070144, 3888415, 3131712, 2382526, 1040771, 2631543, 4607218, 4465829, 3341686, + 2772973, 2611175, 4526129, 2624421, 1220593, 2593137, 3270705, 1503899, 1213033, 4840102, + 4529036, 3421116, 4264831, 4089751, 4272322, 3486998, 2830318, 320953, 2388331, 520808, + 3023094, 1600294, 3631119, 1716614, 4829213, 1175844, 960680, 847662, 3277365, 3957318, + 3455123, 2454259, 670953, 4465677, 1027332, 2560721, 89061, 1163406, 3109528, 3221856, + 4714426, 2382774, 37828, 4433616, 3283229, 591911, 4200188, 442522, 872207, 2437601, + 741003, 266241, 914618, 3626195, 2021080, 4679624, 777476, 2527796, 1114017, 640142, + 49259, 4069879, 3869098, 1105040, 4707804, 3208582, 3325885, 1450601, 4072548, 2037062, + 2029646, 4575891, 1488598, 79105, 4827273, 3795434, 4647518, 4733397, 3980718, 1184627}, + 2, + false), + InducedEgo_Usecase( + "test/datasets/soc-LiveJournal1.mtx", + std::vector{ + 363617, 722214, 2337449, 2510183, 2513389, 225853, 2035807, 3836330, 1865496, 28755, + 2536834, 3070144, 3888415, 3131712, 2382526, 1040771, 2631543, 4607218, 4465829, 3341686, + 2772973, 2611175, 4526129, 2624421, 1220593, 2593137, 3270705, 1503899, 1213033, 4840102, + 4529036, 3421116, 4264831, 4089751, 4272322, 3486998, 2830318, 320953, 2388331, 520808, + 3023094, 1600294, 3631119, 1716614, 4829213, 1175844, 960680, 847662, 3277365, 3957318, + 3455123, 2454259, 670953, 4465677, 1027332, 2560721, 89061, 1163406, 3109528, 3221856, + 4714426, 2382774, 37828, 4433616, 3283229, 591911, 4200188, 442522, 872207, 2437601, + 741003, 266241, 914618, 3626195, 2021080, 4679624, 777476, 2527796, 1114017, 640142, + 49259, 4069879, 3869098, 1105040, 4707804, 3208582, 3325885, 1450601, 4072548, 2037062, + 2029646, 4575891, 1488598, 79105, 4827273, 3795434, 4647518, 4733397, 3980718, 1184627, + 984983, 3114832, 1967741, 1599818, 144593, 2698770, 
2889449, 2495550, 1053813, 1193622, + 686026, 3989015, 2040719, 4693428, 3190376, 2926728, 3399030, 1664419, 662429, 4526841, + 2186957, 3752558, 2440046, 2930226, 3633006, 4058166, 3137060, 3499296, 2126343, 148971, + 2199672, 275811, 2813976, 2274536, 1189239, 1335942, 2465624, 2596042, 829684, 193400, + 2682845, 3691697, 4022437, 4051170, 4195175, 2876420, 3984220, 2174475, 326134, 2606530, + 2493046, 4706121, 1498980, 4576225, 1271339, 44832, 1875673, 4664940, 134931, 736397, + 4333554, 2751031, 2163610, 2879676, 3174153, 3317403, 2052464, 1881883, 4757859, 3596257, + 2358088, 2578758, 447504, 590720, 1717038, 1869795, 1133885, 3027521, 840312, 2818881, + 3654321, 2730947, 353585, 1134903, 2223378, 1508824, 3662521, 1363776, 2712071, 288441, + 1204581, 3502242, 4645567, 2767267, 1514366, 3956099, 1422145, 1216608, 2253360, 189132, + 4238225, 1345783, 451571, 1599442, 3237284, 4711405, 929446, 1857675, 150759, 1277633, + 761210, 138628, 1026833, 2599544, 2464737, 989203, 3399615, 2144292, 216142, 637312, + 2044964, 716256, 1660632, 1762919, 4784357, 2213415, 2764769, 291806, 609772, 3264819, + 1870953, 1516385, 235647, 1045474, 2664957, 819095, 1824119, 4045271, 4448109, 1676788, + 4285177, 1580502, 3546548, 2771971, 3927086, 1339779, 3156204, 1730998, 1172522, 2433024, + 4533449, 479930, 2010695, 672994, 3542039, 3176455, 26352, 2137735, 866910, 4410835, + 2623982, 3603159, 2555625, 2765653, 267865, 2015523, 1009052, 4713994, 1600667, 2176195, + 3179631, 4570390, 2018424, 3356384, 1784287, 894861, 3622099, 1647273, 3044136, 950354, + 1491760, 3416929, 3757300, 2244912, 4129215, 1600848, 3867343, 72329, 919189, 992521, + 3445975, 4712557, 4680974, 188419, 2612093, 1991268, 3566207, 2281468, 3859078, 2492806, + 3398628, 763441, 2679107, 2554420, 2130132, 4664374, 1182901, 3890770, 4714667, 4209303, + 4013060, 3617653, 2040022, 3296519, 4190671, 1693353, 2678411, 3788834, 2781815, 191965, + 1083926, 503974, 3529226, 1650522, 1900976, 542080, 3423929, 3418905, 878165, 4701703, + 3022790, 4316365, 76365, 4053672, 1358185, 3830478, 4445661, 3210024, 1895915, 4541133, + 2938808, 562788, 3920065, 1458776, 4052046, 2967475, 1092809, 3203538, 159626, 3399464, + 214467, 3343982, 1811854, 3189045, 4272117, 4701563, 424807, 4341116, 760545, 4674683, + 1538018, 386762, 194237, 2162719, 1694433, 943728, 2389036, 2196653, 3085571, 1513424, + 3689413, 3278747, 4197291, 3324063, 3651090, 1737936, 2768803, 2768889, 3108096, 4311775, + 3569480, 886705, 733256, 2477493, 1735412, 2960895, 1983781, 1861797, 3566460, 4537673, + 1164093, 3499764, 4553071, 3518985, 847658, 918948, 2922351, 1056144, 652895, 1013195, + 780505, 1702928, 3562838, 1432719, 2405207, 1054920, 641647, 2240939, 3617702, 383165, + 652641, 879593, 1810739, 2096385, 4497865, 4768530, 1743968, 3582014, 1025009, 3002122, + 2422190, 527647, 1251821, 2571153, 4095874, 3705333, 3637407, 1385567, 4043855, 4041930, + 2433139, 1710383, 1127734, 4362316, 711588, 817839, 3214775, 910077, 1313768, 2382229, + 16864, 2081770, 3095420, 3195272, 548711, 2259860, 1167323, 2435974, 425238, 2085179, + 2630042, 2632881, 2867923, 3703565, 1037695, 226617, 4379130, 1541468, 3581937, 605965, + 1137674, 4655221, 4769963, 1394370, 4425315, 2990132, 2364485, 1561137, 2713384, 481509, + 2900382, 934766, 2986774, 1767669, 298593, 2502539, 139296, 3794229, 4002180, 4718138, + 2909238, 423691, 3023810, 2784924, 2760160, 1971980, 316683, 3828090, 3253691, 4839313, + 1203624, 584938, 3901482, 1747543, 1572737, 3533226, 774708, 1691195, 1037110, 1557763, + 225120, 
4424243, 3524086, 1717663, 4332507, 3513592, 4274932, 1232118, 873498, 1416042, + 2488925, 111391, 4704545, 4492545, 445317, 1584812, 2187737, 2471948, 3731678, 219255, + 2282627, 2589971, 2372185, 4609096, 3673961, 2524410, 12823, 2437155, 3015974, 4188352, + 3184084, 3690756, 1222341, 1278376, 3652030, 4162647, 326548, 3930062, 3926100, 1551222, + 2722165, 4526695, 3997534, 4815513, 3139056, 2547644, 3028915, 4149092, 3656554, 2691582, + 2676699, 1878842, 260174, 3129900, 4379993, 182347, 2189338, 3783616, 2616666, 2596952, + 243007, 4179282, 2730, 1939894, 2332032, 3335636, 182332, 3112260, 2174584, 587481, + 4527368, 3154106, 3403059, 673206, 2150292, 446521, 1600204, 4819428, 2591357, 48490, + 2917012, 2285923, 1072926, 2824281, 4364250, 956033, 311938, 37251, 3729300, 2726300, + 644966, 1623020, 1419070, 4646747, 2417222, 2680238, 2561083, 1793801, 2349366, 339747, + 611366, 4684147, 4356907, 1277161, 4510381, 3218352, 4161658, 3200733, 1172372, 3997786, + 3169266, 3353418, 2248955, 2875885, 2365369, 498208, 2968066, 2681505, 2059048, 2097106, + 3607540, 1121504, 2016789, 1762605, 3138431, 866081, 3705757, 3833066, 2599788, 760816, + 4046672, 1544367, 2983906, 4842911, 209599, 1250954, 3333704, 561212, 4674336, 2831841, + 3690724, 2929360, 4830834, 1177524, 2487687, 3525137, 875283, 651241, 2110742, 1296646, + 1543739, 4349417, 2384725, 1931751, 1519208, 1520034, 3385008, 3219962, 734912, 170230, + 1741419, 729913, 2860117, 2362381, 1199807, 2424230, 177824, 125948, 2722701, 4687548, + 1140771, 3232742, 4522020, 4376360, 1125603, 590312, 2481884, 138951, 4086775, 615155, + 3395781, 4587272, 283209, 568470, 4296185, 4344150, 2454321, 2672602, 838828, 4051647, + 1709120, 3074610, 693235, 4356087, 3018806, 239410, 2431497, 691186, 766276, 4462126, + 859155, 2370304, 1571808, 1938673, 1694955, 3871296, 4245059, 3987376, 301524, 2512461, + 3410437, 3300380, 684922, 4581995, 3599557, 683515, 1850634, 3704678, 1937490, 2035591, + 3718533, 2065879, 3160765, 1467884, 1912241, 2501509, 3668572, 3390469, 2501150, 612319, + 713633, 1976262, 135946, 3641535, 632083, 13414, 4217765, 4137712, 2550250, 3281035, + 4179598, 961045, 2020694, 4380006, 1345936, 289162, 1359035, 770872, 4509911, 3947317, + 4719693, 248568, 2625660, 1237232, 2153208, 4814282, 1259954, 3677369, 861222, 2883506, + 3339149, 3998335, 491017, 1609022, 2648112, 742132, 649609, 4206953, 3131106, 3504814, + 3344486, 611721, 3215620, 2856233, 4447505, 1949222, 1868345, 712710, 6966, 4730666, + 3181872, 2972889, 3038521, 3525444, 4385208, 1845613, 1124187, 2030476, 4468651, 2478792, + 3473580, 3783357, 1852991, 1648485, 871319, 1670723, 4458328, 3218600, 1811100, 3443356, + 2233873, 3035207, 2548692, 3337891, 3773674, 1552957, 4782811, 3144712, 3523466, 1491315, + 3955852, 1838410, 3164028, 1092543, 776459, 2959379, 2541744, 4064418, 3908320, 2854145, + 3960709, 1348188, 977678, 853619, 1304291, 2848702, 1657913, 1319826, 3322665, 788037, + 2913686, 4471279, 1766285, 348304, 56570, 1892118, 4017244, 401006, 3524539, 4310134, + 1624693, 4081113, 957511, 849400, 129975, 2616130, 378537, 1556787, 3916162, 1039980, + 4407778, 2027690, 4213675, 839863, 683134, 75805, 2493150, 4215796, 81587, 751845, + 1255588, 1947964, 1950470, 859401, 3077088, 3931110, 2316256, 1523761, 4527477, 4237511, + 1123513, 4209796, 3584772, 4250563, 2091754, 1618766, 2139944, 4525352, 382159, 2955887, + 41760, 2313998, 496912, 3791570, 3904792, 3613654, 873959, 127076, 2537797, 2458107, + 4543265, 3661909, 26828, 271816, 17854, 2461269, 1776042, 1573899, 
3409957, 4335712, + 4534313, 3392751, 1230124, 2159031, 4444015, 3373087, 3848014, 2026600, 1382747, 3537242, + 4536743, 4714155, 3788371, 3570849, 173741, 211962, 4377778, 119369, 2856973, 2945854, + 1508054, 4503932, 3141566, 1842177, 3448683, 3384614, 2886508, 1573965, 990618, 3053734, + 2918742, 4508753, 1032149, 60943, 4291620, 722607, 2883224, 169359, 4356585, 3725543, + 3678729, 341673, 3592828, 4077251, 3382936, 3885685, 4630994, 1286698, 4449616, 1138430, + 3113385, 4660578, 2539973, 4562286, 4085089, 494737, 3967610, 2130702, 1823755, 1369324, + 3796951, 956299, 141730, 935144, 4381893, 4412545, 1382250, 3024476, 2364546, 3396164, + 3573511, 314081, 577688, 4154135, 1567018, 4047761, 2446220, 1148833, 4842497, 3967186, + 1175290, 3749667, 1209593, 3295627, 3169065, 2460328, 1838486, 1436923, 2843887, 3676426, + 2079145, 2975635, 535071, 4287509, 3281107, 39606, 3115500, 3204573, 722131, 3124073}, + 2, + false))); +*/ +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/community/leiden_test.cpp b/cpp/tests/community/leiden_test.cpp index 764ab8bf6cb..13e139666f6 100644 --- a/cpp/tests/community/leiden_test.cpp +++ b/cpp/tests/community/leiden_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * * NVIDIA CORPORATION and its licensors retain all intellectual property * and proprietary rights in and to this software, related documentation @@ -10,8 +10,8 @@ */ #include -#include -#include +#include +#include #include @@ -19,6 +19,10 @@ TEST(leiden_karate, success) { + raft::handle_t handle; + + auto stream = handle.get_stream(); + std::vector off_h = {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, 98, 101, 104, 106, 110, 113, 117, 121, 127, 139, 156}; @@ -46,27 +50,38 @@ TEST(leiden_karate, success) std::vector cluster_id(num_verts, -1); - rmm::device_vector offsets_v(off_h); - rmm::device_vector indices_v(ind_h); - rmm::device_vector weights_v(w_h); - rmm::device_vector result_v(cluster_id); + rmm::device_uvector offsets_v(num_verts + 1, stream); + rmm::device_uvector indices_v(num_edges, stream); + rmm::device_uvector weights_v(num_edges, stream); + rmm::device_uvector result_v(num_verts, stream); - cugraph::GraphCSRView G( - offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); + raft::update_device(offsets_v.data(), off_h.data(), off_h.size(), stream); + raft::update_device(indices_v.data(), ind_h.data(), ind_h.size(), stream); + raft::update_device(weights_v.data(), w_h.data(), w_h.size(), stream); + + cugraph::legacy::GraphCSRView G( + offsets_v.data(), indices_v.data(), weights_v.data(), num_verts, num_edges); float modularity{0.0}; size_t num_level = 40; - raft::handle_t handle; - std::tie(num_level, modularity) = cugraph::leiden(handle, G, result_v.data().get()); + // "FIXME": remove this check once we drop support for Pascal + // + // Calling louvain on Pascal will throw an exception, we'll check that + // this is the behavior while we still support Pascal (device_prop.major < 7) + // + if (handle.get_device_properties().major < 7) { + EXPECT_THROW(cugraph::leiden(handle, G, result_v.data()), cugraph::logic_error); + } else { + std::tie(num_level, modularity) = cugraph::leiden(handle, G, result_v.data()); + + raft::update_host(cluster_id.data(), result_v.data(), num_verts, stream); - cudaMemcpy((void*)&(cluster_id[0]), - result_v.data().get(), - sizeof(int) * num_verts, - 
cudaMemcpyDeviceToHost); + CUDA_TRY(cudaDeviceSynchronize()); - int min = *min_element(cluster_id.begin(), cluster_id.end()); + int min = *min_element(cluster_id.begin(), cluster_id.end()); - ASSERT_GE(min, 0); - ASSERT_GE(modularity, 0.41116042 * 0.99); + ASSERT_GE(min, 0); + ASSERT_GE(modularity, 0.41116042 * 0.99); + } } diff --git a/cpp/tests/community/louvain_test.cpp b/cpp/tests/community/louvain_test.cpp new file mode 100644 index 00000000000..4dca720483e --- /dev/null +++ b/cpp/tests/community/louvain_test.cpp @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + * + */ +#include +#include + +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +struct Louvain_Usecase { + std::string graph_file_full_path_{}; + bool test_weighted_{false}; + int expected_level_{0}; + float expected_modularity_{0}; + + Louvain_Usecase(std::string const& graph_file_path, + bool test_weighted, + int expected_level, + float expected_modularity) + : test_weighted_(test_weighted), + expected_level_(expected_level), + expected_modularity_(expected_modularity) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path_ = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path_ = graph_file_path; + } + }; +}; + +class Tests_Louvain : public ::testing::TestWithParam { + public: + Tests_Louvain() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_legacy_test(Louvain_Usecase const& configuration) + { + raft::handle_t handle{}; + + bool directed{false}; + + auto graph = cugraph::test::generate_graph_csr_from_mm( + directed, configuration.graph_file_full_path_); + auto graph_view = graph->view(); + + // "FIXME": remove this check once we drop support for Pascal + // + // Calling louvain on Pascal will throw an exception, we'll check that + // this is the behavior while we still support Pascal (device_prop.major < 7) + // + cudaDeviceProp device_prop; + CUDA_CHECK(cudaGetDeviceProperties(&device_prop, 0)); + + if (device_prop.major < 7) { + EXPECT_THROW(louvain(graph_view, + graph_view.get_number_of_vertices(), + configuration.expected_level_, + configuration.expected_modularity_), + cugraph::logic_error); + } else { + louvain(graph_view, + graph_view.get_number_of_vertices(), + configuration.expected_level_, + configuration.expected_modularity_); + } + } + + template + void run_current_test(Louvain_Usecase const& configuration) + { + raft::handle_t handle{}; + + cugraph::experimental::graph_t graph(handle); + std::tie(graph, std::ignore) = + cugraph::test::read_graph_from_matrix_market_file( + handle, configuration.graph_file_full_path_, configuration.test_weighted_, false); + + auto graph_view = graph.view(); + + // "FIXME": remove this check once we drop support for Pascal + // + // Calling louvain on Pascal will throw an exception, we'll check that + // this is the behavior while we still support Pascal 
(device_prop.major < 7) + // + cudaDeviceProp device_prop; + CUDA_CHECK(cudaGetDeviceProperties(&device_prop, 0)); + + if (device_prop.major < 7) { + EXPECT_THROW(louvain(graph_view, + graph_view.get_number_of_local_vertices(), + configuration.expected_level_, + configuration.expected_modularity_), + cugraph::logic_error); + } else { + louvain(graph_view, + graph_view.get_number_of_local_vertices(), + configuration.expected_level_, + configuration.expected_modularity_); + } + } + + template + void louvain(graph_t const& graph_view, + typename graph_t::vertex_type num_vertices, + int expected_level, + float expected_modularity) + { + using vertex_t = typename graph_t::vertex_type; + using weight_t = typename graph_t::weight_type; + + raft::handle_t handle{}; + + rmm::device_uvector clustering_v(num_vertices, handle.get_stream()); + size_t level; + weight_t modularity; + + std::tie(level, modularity) = + cugraph::louvain(handle, graph_view, clustering_v.data(), size_t{100}, weight_t{1}); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + float compare_modularity = static_cast(modularity); + + ASSERT_FLOAT_EQ(compare_modularity, expected_modularity); + ASSERT_EQ(level, expected_level); + } +}; + +// FIXME: add tests for type combinations + +TEST(louvain_legacy, success) +{ + raft::handle_t handle; + + auto stream = handle.get_stream(); + + std::vector off_h = {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, + 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, + 98, 101, 104, 106, 110, 113, 117, 121, 127, 139, 156}; + std::vector ind_h = { + 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, + 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0, 6, + 10, 16, 0, 4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33, 0, 4, 5, 0, 0, 3, + 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1, 32, + 33, 25, 27, 29, 32, 33, 25, 27, 31, 23, 24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, + 32, 33, 1, 8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, + 33, 8, 9, 13, 14, 15, 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32}; + std::vector w_h = { + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + + std::vector result_h = {1, 1, 1, 1, 0, 0, 0, 1, 3, 1, 0, 1, 1, 1, 3, 3, 0, + 1, 3, 1, 3, 1, 3, 2, 2, 2, 3, 2, 1, 3, 3, 2, 3, 3}; + + int num_verts = off_h.size() - 1; + int num_edges = ind_h.size(); + + std::vector cluster_id(num_verts, -1); + + rmm::device_uvector offsets_v(num_verts + 1, stream); + rmm::device_uvector indices_v(num_edges, stream); + rmm::device_uvector weights_v(num_edges, stream); + rmm::device_uvector result_v(num_verts, stream); + + raft::update_device(offsets_v.data(), off_h.data(), off_h.size(), stream); + 
raft::update_device(indices_v.data(), ind_h.data(), ind_h.size(), stream); + raft::update_device(weights_v.data(), w_h.data(), w_h.size(), stream); + + cugraph::legacy::GraphCSRView G( + offsets_v.data(), indices_v.data(), weights_v.data(), num_verts, num_edges); + + float modularity{0.0}; + size_t num_level = 40; + + // "FIXME": remove this check once we drop support for Pascal + // + // Calling louvain on Pascal will throw an exception, we'll check that + // this is the behavior while we still support Pascal (device_prop.major < 7) + // + if (handle.get_device_properties().major < 7) { + EXPECT_THROW(cugraph::louvain(handle, G, result_v.data()), cugraph::logic_error); + } else { + std::tie(num_level, modularity) = cugraph::louvain(handle, G, result_v.data()); + + raft::update_host(cluster_id.data(), result_v.data(), num_verts, stream); + + CUDA_TRY(cudaDeviceSynchronize()); + + int min = *min_element(cluster_id.begin(), cluster_id.end()); + + ASSERT_GE(min, 0); + ASSERT_FLOAT_EQ(modularity, 0.408695); + ASSERT_EQ(cluster_id, result_h); + } +} + +TEST(louvain_legacy_renumbered, success) +{ + raft::handle_t handle; + + auto stream = handle.get_stream(); + + std::vector off_h = {0, 16, 25, 30, 34, 38, 42, 44, 46, 48, 50, 52, + 54, 56, 73, 85, 95, 101, 107, 112, 117, 121, 125, 129, + 132, 135, 138, 141, 144, 147, 149, 151, 153, 155, 156}; + std::vector ind_h = { + 1, 3, 7, 11, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 30, 33, 0, 5, 11, 15, 16, 19, 21, + 25, 30, 4, 13, 14, 22, 27, 0, 9, 20, 24, 2, 13, 15, 26, 1, 13, 14, 18, 13, 15, 0, 16, + 13, 14, 3, 20, 13, 14, 0, 1, 13, 22, 2, 4, 5, 6, 8, 10, 12, 14, 17, 18, 19, 22, 25, + 28, 29, 31, 32, 2, 5, 8, 10, 13, 15, 17, 18, 22, 29, 31, 32, 0, 1, 4, 6, 14, 16, 18, + 19, 21, 28, 0, 1, 7, 15, 19, 21, 0, 13, 14, 26, 27, 28, 0, 5, 13, 14, 15, 0, 1, 13, + 16, 16, 0, 3, 9, 23, 0, 1, 15, 16, 2, 12, 13, 14, 0, 20, 24, 0, 3, 23, 0, 1, 13, + 4, 17, 27, 2, 17, 26, 13, 15, 17, 13, 14, 0, 1, 13, 14, 13, 14, 0}; + + std::vector w_h = { + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + + int num_verts = off_h.size() - 1; + int num_edges = ind_h.size(); + + std::vector cluster_id(num_verts, -1); + + rmm::device_uvector offsets_v(num_verts + 1, stream); + rmm::device_uvector indices_v(num_edges, stream); + rmm::device_uvector weights_v(num_edges, stream); + rmm::device_uvector result_v(num_verts, stream); + + raft::update_device(offsets_v.data(), off_h.data(), off_h.size(), stream); + raft::update_device(indices_v.data(), ind_h.data(), ind_h.size(), stream); + raft::update_device(weights_v.data(), w_h.data(), w_h.size(), stream); + + cugraph::legacy::GraphCSRView G( + offsets_v.data(), indices_v.data(), weights_v.data(), num_verts, num_edges); + + float modularity{0.0}; + size_t num_level = 40; + + // "FIXME": remove this check 
once we drop support for Pascal + // + // Calling louvain on Pascal will throw an exception, we'll check that + // this is the behavior while we still support Pascal (device_prop.major < 7) + // + if (handle.get_device_properties().major < 7) { + EXPECT_THROW(cugraph::louvain(handle, G, result_v.data()), cugraph::logic_error); + } else { + std::tie(num_level, modularity) = cugraph::louvain(handle, G, result_v.data()); + + raft::update_host(cluster_id.data(), result_v.data(), num_verts, stream); + + CUDA_TRY(cudaDeviceSynchronize()); + + int min = *min_element(cluster_id.begin(), cluster_id.end()); + + ASSERT_GE(min, 0); + ASSERT_FLOAT_EQ(modularity, 0.41880345); + } +} + +TEST_P(Tests_Louvain, CheckInt32Int32FloatFloatLegacy) +{ + run_legacy_test(GetParam()); +} + +TEST_P(Tests_Louvain, CheckInt32Int32FloatFloat) +{ + run_current_test(GetParam()); +} + +TEST_P(Tests_Louvain, CheckInt64Int64FloatFloat) +{ + run_current_test(GetParam()); +} + +// FIXME: Expand testing once we evaluate RMM memory use +INSTANTIATE_TEST_SUITE_P( + simple_test, + Tests_Louvain, + ::testing::Values(Louvain_Usecase("test/datasets/karate.mtx", true, 3, 0.408695))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/community/louvain_test.cu b/cpp/tests/community/louvain_test.cu deleted file mode 100644 index 2bac0097212..00000000000 --- a/cpp/tests/community/louvain_test.cu +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. 
- * - */ -#include - -#include -#include - -#include - -#include - -TEST(louvain, success) -{ - std::vector off_h = {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, - 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, - 98, 101, 104, 106, 110, 113, 117, 121, 127, 139, 156}; - std::vector ind_h = { - 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, - 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0, 6, - 10, 16, 0, 4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33, 0, 4, 5, 0, 0, 3, - 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1, 32, - 33, 25, 27, 29, 32, 33, 25, 27, 31, 23, 24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, - 32, 33, 1, 8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, - 33, 8, 9, 13, 14, 15, 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32}; - std::vector w_h = { - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - - std::vector result_h = {1, 1, 1, 1, 0, 0, 0, 1, 3, 1, 0, 1, 1, 1, 3, 3, 0, - 1, 3, 1, 3, 1, 3, 2, 2, 2, 3, 2, 1, 3, 3, 2, 3, 3}; - - int num_verts = off_h.size() - 1; - int num_edges = ind_h.size(); - - std::vector cluster_id(num_verts, -1); - - rmm::device_vector offsets_v(off_h); - rmm::device_vector indices_v(ind_h); - rmm::device_vector weights_v(w_h); - rmm::device_vector result_v(cluster_id); - - cugraph::GraphCSRView G( - offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); - - float modularity{0.0}; - size_t num_level = 40; - - raft::handle_t handle; - - std::tie(num_level, modularity) = cugraph::louvain(handle, G, result_v.data().get()); - - cudaMemcpy((void*)&(cluster_id[0]), - result_v.data().get(), - sizeof(int) * num_verts, - cudaMemcpyDeviceToHost); - - int min = *min_element(cluster_id.begin(), cluster_id.end()); - - std::cout << "modularity = " << modularity << std::endl; - - ASSERT_GE(min, 0); - ASSERT_GE(modularity, 0.402777 * 0.95); - ASSERT_EQ(result_v, result_h); -} - -TEST(louvain_renumbered, success) -{ - std::vector off_h = {0, 16, 25, 30, 34, 38, 42, 44, 46, 48, 50, 52, - 54, 56, 73, 85, 95, 101, 107, 112, 117, 121, 125, 129, - 132, 135, 138, 141, 144, 147, 149, 151, 153, 155, 156 - - }; - std::vector ind_h = { - 1, 3, 7, 11, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 30, 33, 0, 5, 11, 15, 16, 19, 21, - 25, 30, 4, 13, 14, 22, 27, 0, 9, 20, 24, 2, 13, 15, 26, 1, 13, 14, 18, 13, 15, 0, 16, - 13, 14, 3, 20, 13, 14, 0, 1, 13, 22, 2, 4, 5, 6, 8, 10, 12, 14, 17, 18, 19, 22, 25, - 28, 29, 31, 32, 2, 5, 8, 10, 13, 15, 17, 18, 22, 29, 31, 32, 0, 1, 4, 6, 14, 16, 18, - 19, 21, 28, 0, 1, 7, 15, 19, 21, 0, 13, 14, 26, 27, 28, 0, 5, 13, 14, 15, 0, 1, 13, - 16, 16, 0, 3, 9, 23, 0, 1, 15, 16, 2, 12, 13, 14, 0, 20, 24, 0, 3, 23, 0, 1, 13, - 4, 17, 27, 2, 17, 26, 13, 15, 17, 13, 
14, 0, 1, 13, 14, 13, 14, 0}; - - std::vector w_h = { - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - - int num_verts = off_h.size() - 1; - int num_edges = ind_h.size(); - - std::vector cluster_id(num_verts, -1); - - rmm::device_vector offsets_v(off_h); - rmm::device_vector indices_v(ind_h); - rmm::device_vector weights_v(w_h); - rmm::device_vector result_v(cluster_id); - - cugraph::GraphCSRView G( - offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); - - float modularity{0.0}; - size_t num_level = 40; - - raft::handle_t handle; - - std::tie(num_level, modularity) = cugraph::louvain(handle, G, result_v.data().get()); - - cudaMemcpy((void*)&(cluster_id[0]), - result_v.data().get(), - sizeof(int) * num_verts, - cudaMemcpyDeviceToHost); - - int min = *min_element(cluster_id.begin(), cluster_id.end()); - - std::cout << "modularity = " << modularity << std::endl; - - ASSERT_GE(min, 0); - ASSERT_GE(modularity, 0.402777 * 0.95); -} - -CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/community/mg_louvain_helper.cu b/cpp/tests/community/mg_louvain_helper.cu new file mode 100644 index 00000000000..b5347778b4c --- /dev/null +++ b/cpp/tests/community/mg_louvain_helper.cu @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "mg_louvain_helper.hpp" + +#include + +#include +#include +#include + +#include + +#include +#include +#include + +namespace cugraph { +namespace test { + +template +void single_gpu_renumber_edgelist_given_number_map(raft::handle_t const& handle, + rmm::device_uvector& edgelist_rows_v, + rmm::device_uvector& edgelist_cols_v, + rmm::device_uvector& renumber_map_gathered_v) +{ + rmm::device_uvector index_v(renumber_map_gathered_v.size(), handle.get_stream()); + + thrust::for_each( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(renumber_map_gathered_v.size()), + [d_renumber_map_gathered = renumber_map_gathered_v.data(), d_index = index_v.data()] __device__( + auto idx) { d_index[d_renumber_map_gathered[idx]] = idx; }); + + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_rows_v.begin(), + edgelist_rows_v.end(), + edgelist_rows_v.begin(), + [d_index = index_v.data()] __device__(auto v) { return d_index[v]; }); + + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_cols_v.begin(), + edgelist_cols_v.end(), + edgelist_cols_v.begin(), + [d_index = index_v.data()] __device__(auto v) { return d_index[v]; }); +} + +template +std::tuple, + rmm::device_uvector, + std::optional>> +compressed_sparse_to_edgelist(edge_t const* compressed_sparse_offsets, + vertex_t const* compressed_sparse_indices, + std::optional compressed_sparse_weights, + vertex_t major_first, + vertex_t major_last, + cudaStream_t stream) +{ + edge_t number_of_edges{0}; + raft::update_host( + &number_of_edges, compressed_sparse_offsets + (major_last - major_first), 1, stream); + CUDA_TRY(cudaStreamSynchronize(stream)); + rmm::device_uvector edgelist_major_vertices(number_of_edges, stream); + rmm::device_uvector edgelist_minor_vertices(number_of_edges, stream); + auto edgelist_weights = + compressed_sparse_weights + ? 
std::make_optional<rmm::device_uvector<weight_t>>(number_of_edges, stream)
+      : std::nullopt;
+
+  // FIXME: this is highly inefficient for very high-degree vertices; for better performance, we
+  // can fill high-degree vertices using one CUDA block per vertex, mid-degree vertices using one
+  // CUDA warp per vertex, and low-degree vertices using one CUDA thread per vertex
+  thrust::for_each(rmm::exec_policy(stream)->on(stream),
+                   thrust::make_counting_iterator(major_first),
+                   thrust::make_counting_iterator(major_last),
+                   [compressed_sparse_offsets,
+                    major_first,
+                    p_majors = edgelist_major_vertices.begin()] __device__(auto v) {
+                     auto first = compressed_sparse_offsets[v - major_first];
+                     auto last  = compressed_sparse_offsets[v - major_first + 1];
+                     thrust::fill(thrust::seq, p_majors + first, p_majors + last, v);
+                   });
+  thrust::copy(rmm::exec_policy(stream)->on(stream),
+               compressed_sparse_indices,
+               compressed_sparse_indices + number_of_edges,
+               edgelist_minor_vertices.begin());
+  if (compressed_sparse_weights) {
+    thrust::copy(rmm::exec_policy(stream)->on(stream),
+                 (*compressed_sparse_weights),
+                 (*compressed_sparse_weights) + number_of_edges,
+                 (*edgelist_weights).data());
+  }
+
+  return std::make_tuple(std::move(edgelist_major_vertices),
+                         std::move(edgelist_minor_vertices),
+                         std::move(edgelist_weights));
+}
+
+template <typename vertex_t, typename weight_t>
+void sort_and_coarsen_edgelist(
+  rmm::device_uvector<vertex_t>& edgelist_major_vertices /* [INOUT] */,
+  rmm::device_uvector<vertex_t>& edgelist_minor_vertices /* [INOUT] */,
+  std::optional<rmm::device_uvector<weight_t>>& edgelist_weights /* [INOUT] */,
+  cudaStream_t stream)
+{
+  auto pair_first = thrust::make_zip_iterator(
+    thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin()));
+
+  size_t number_of_edges{0};
+  if (edgelist_weights) {
+    thrust::sort_by_key(rmm::exec_policy(stream)->on(stream),
+                        pair_first,
+                        pair_first + edgelist_major_vertices.size(),
+                        (*edgelist_weights).begin());
+
+    rmm::device_uvector<vertex_t> tmp_edgelist_major_vertices(edgelist_major_vertices.size(),
+                                                              stream);
+    rmm::device_uvector<vertex_t> tmp_edgelist_minor_vertices(tmp_edgelist_major_vertices.size(),
+                                                              stream);
+    rmm::device_uvector<weight_t> tmp_edgelist_weights(tmp_edgelist_major_vertices.size(), stream);
+    auto it = thrust::reduce_by_key(
+      rmm::exec_policy(stream)->on(stream),
+      pair_first,
+      pair_first + edgelist_major_vertices.size(),
+      (*edgelist_weights).begin(),
+      thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(),
+                                                   tmp_edgelist_minor_vertices.begin())),
+      tmp_edgelist_weights.begin());
+    number_of_edges = thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it));
+
+    edgelist_major_vertices = std::move(tmp_edgelist_major_vertices);
+    edgelist_minor_vertices = std::move(tmp_edgelist_minor_vertices);
+    (*edgelist_weights)     = std::move(tmp_edgelist_weights);
+  } else {
+    thrust::sort(rmm::exec_policy(stream)->on(stream),
+                 pair_first,
+                 pair_first + edgelist_major_vertices.size());
+    auto it         = thrust::unique(rmm::exec_policy(stream)->on(stream),
+                             pair_first,
+                             pair_first + edgelist_major_vertices.size());
+    number_of_edges = thrust::distance(pair_first, it);
+  }
+
+  edgelist_major_vertices.resize(number_of_edges, stream);
+  edgelist_minor_vertices.resize(number_of_edges, stream);
+  edgelist_major_vertices.shrink_to_fit(stream);
+  edgelist_minor_vertices.shrink_to_fit(stream);
+  if (edgelist_weights) {
+    (*edgelist_weights).resize(number_of_edges, stream);
+    (*edgelist_weights).shrink_to_fit(stream);
+  }
+}
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>>
+compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist(
+  edge_t const* compressed_sparse_offsets,
+  vertex_t const* compressed_sparse_indices,
+  std::optional<weight_t const*> compressed_sparse_weights,
+  vertex_t const* p_major_labels,
+  vertex_t const* p_minor_labels,
+  vertex_t major_first,
+  vertex_t major_last,
+  vertex_t minor_first,
+  vertex_t minor_last,
+  cudaStream_t stream)
+{
+  // FIXME: it might be possible to directly create the relabeled & coarsened edgelist from the
+  // compressed sparse format to save memory
+
+  auto [edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights] =
+    compressed_sparse_to_edgelist(compressed_sparse_offsets,
+                                  compressed_sparse_indices,
+                                  compressed_sparse_weights,
+                                  major_first,
+                                  major_last,
+                                  stream);
+
+  auto pair_first = thrust::make_zip_iterator(
+    thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin()));
+  thrust::transform(
+    rmm::exec_policy(stream)->on(stream),
+    pair_first,
+    pair_first + edgelist_major_vertices.size(),
+    pair_first,
+    [p_major_labels, p_minor_labels, major_first, minor_first] __device__(auto val) {
+      return thrust::make_tuple(p_major_labels[thrust::get<0>(val) - major_first],
+                                p_minor_labels[thrust::get<1>(val) - minor_first]);
+    });
+
+  sort_and_coarsen_edgelist(
+    edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights, stream);
+
+  return std::make_tuple(std::move(edgelist_major_vertices),
+                         std::move(edgelist_minor_vertices),
+                         std::move(edgelist_weights));
+}
+
+// FIXME: better to add "bool renumber" (which must be false in MG) to the coarsen_graph function
+// instead of replicating the code here. This is the single-GPU version.
+template <typename vertex_t, typename edge_t, typename weight_t, bool store_transposed>
+std::unique_ptr<
+  cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, store_transposed, false>>
+coarsen_graph(
+  raft::handle_t const& handle,
+  cugraph::experimental::graph_view_t<vertex_t, edge_t, weight_t, store_transposed, false> const&
+    graph_view,
+  vertex_t const* labels)
+{
+  auto [coarsened_edgelist_major_vertices,
+        coarsened_edgelist_minor_vertices,
+        coarsened_edgelist_weights] =
+    compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist(
+      graph_view.get_matrix_partition_view().get_offsets(),
+      graph_view.get_matrix_partition_view().get_indices(),
+      graph_view.get_matrix_partition_view().get_weights(),
+      labels,
+      labels,
+      vertex_t{0},
+      graph_view.get_number_of_vertices(),
+      vertex_t{0},
+      graph_view.get_number_of_vertices(),
+      handle.get_stream());
+
+  cugraph::experimental::edgelist_t<vertex_t, edge_t, weight_t> edgelist{};
+  edgelist.p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices.data()
+                                             : coarsened_edgelist_major_vertices.data();
+  edgelist.p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices.data()
+                                             : coarsened_edgelist_minor_vertices.data();
+  edgelist.p_edge_weights = coarsened_edgelist_weights
+                              ?
std::optional{(*coarsened_edgelist_weights).data()} + : std::nullopt; + edgelist.number_of_edges = static_cast(coarsened_edgelist_major_vertices.size()); + + vertex_t new_number_of_vertices = + 1 + thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + labels, + labels + graph_view.get_number_of_vertices(), + vertex_t{0}, + thrust::maximum()); + + return std::make_unique< + cugraph::experimental::graph_t>( + handle, + edgelist, + new_number_of_vertices, + cugraph::experimental::graph_properties_t{graph_view.is_symmetric(), false}, + std::nullopt); +} + +// explicit instantiation + +template void single_gpu_renumber_edgelist_given_number_map( + raft::handle_t const& handle, + rmm::device_uvector& d_edgelist_rows, + rmm::device_uvector& d_edgelist_cols, + rmm::device_uvector& d_renumber_map_gathered_v); + +template std::unique_ptr> +coarsen_graph( + raft::handle_t const& handle, + cugraph::experimental::graph_view_t const& graph_view, + int32_t const* labels); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/community/mg_louvain_helper.hpp b/cpp/tests/community/mg_louvain_helper.hpp new file mode 100644 index 00000000000..7e03f435724 --- /dev/null +++ b/cpp/tests/community/mg_louvain_helper.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace cugraph { +namespace test { + +template +bool compare_renumbered_vectors(raft::handle_t const& handle, + std::vector const& v1, + std::vector const& v2); + +template +bool compare_renumbered_vectors(raft::handle_t const& handle, + rmm::device_uvector const& v1, + rmm::device_uvector const& v2); + +template +void single_gpu_renumber_edgelist_given_number_map( + raft::handle_t const& handle, + rmm::device_uvector& d_edgelist_rows, + rmm::device_uvector& d_edgelist_cols, + rmm::device_uvector& d_renumber_map_gathered_v); + +template +std::unique_ptr> +coarsen_graph( + raft::handle_t const& handle, + cugraph::experimental::graph_view_t const& + graph_view, + vertex_t const* labels); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp new file mode 100644 index 00000000000..1f7276fa116 --- /dev/null +++ b/cpp/tests/community/mg_louvain_test.cpp @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "mg_louvain_helper.hpp" + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +void compare(float mg_modularity, float sg_modularity) +{ + ASSERT_FLOAT_EQ(mg_modularity, sg_modularity); +} +void compare(double mg_modularity, double sg_modularity) +{ + ASSERT_DOUBLE_EQ(mg_modularity, sg_modularity); +} + +//////////////////////////////////////////////////////////////////////////////// +// Test param object. This defines the input and expected output for a test, and +// will be instantiated as the parameter to the tests defined below using +// INSTANTIATE_TEST_SUITE_P() +// +struct Louvain_Usecase { + std::string graph_file_full_path{}; + bool weighted{false}; + size_t max_level; + double resolution; + + // FIXME: We really should have a Graph_Testparms_Base class or something + // like that which can handle this graph_full_path thing. + // + Louvain_Usecase(std::string const& graph_file_path, + bool weighted, + size_t max_level, + double resolution) + : weighted(weighted), max_level(max_level), resolution(resolution) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +}; + +//////////////////////////////////////////////////////////////////////////////// +// Parameterized test fixture, to be used with TEST_P(). This defines common +// setup and teardown steps as well as common utilities used by each E2E MG +// test. In this case, each test is identical except for the inputs and +// expected outputs, so the entire test is defined in the run_test() method. +// +class Louvain_MG_Testfixture : public ::testing::TestWithParam { + public: + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + // Run once for each test instance + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of MNMG Louvain with the results of running + // each step of SG Louvain, renumbering the coarsened graphs based + // on the MNMG renumbering. 
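+  //
+  // A minimal editorial sketch (hypothetical host-side helper, not part of this
+  // file) of what "matching up to renumbering" means: two label vectors describe
+  // the same clustering if one consistent relabeling maps one onto the other.
+  //
+  //   bool same_up_to_relabeling(std::vector<int> const& a, std::vector<int> const& b)
+  //   {
+  //     if (a.size() != b.size()) return false;
+  //     std::map<int, int> a_to_b;
+  //     for (size_t i = 0; i < a.size(); ++i) {
+  //       auto [pos, inserted] = a_to_b.insert({a[i], b[i]});
+  //       if (!inserted && pos->second != b[i]) return false;  // inconsistent relabeling
+  //     }
+  //     return true;  // a full check also verifies the reverse mapping
+  //   }
+  //
+  // The device-side comparison used in the body below is assumed to perform the
+  // equivalent check on GPU data.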
+ template + void compare_sg_results(raft::handle_t const& handle, + std::string const& graph_filename, + rmm::device_uvector& d_renumber_map_gathered_v, + cugraph::Dendrogram const& dendrogram, + weight_t resolution, + int rank, + weight_t mg_modularity) + { + auto sg_graph = + std::make_unique>( + handle); + rmm::device_uvector d_clustering_v(0, handle.get_stream()); + weight_t sg_modularity{-1.0}; + + if (rank == 0) { + // Create initial SG graph, renumbered according to the MNMG renumber map + + auto [d_edgelist_rows, + d_edgelist_cols, + d_edgelist_weights, + number_of_vertices, + is_symmetric] = + cugraph::test::read_edgelist_from_matrix_market_file( + handle, graph_filename, true); + + rmm::device_uvector d_vertices(number_of_vertices, handle.get_stream()); + std::vector h_vertices(number_of_vertices); + + d_clustering_v.resize(d_vertices.size(), handle.get_stream()); + + thrust::sequence(thrust::host, h_vertices.begin(), h_vertices.end(), vertex_t{0}); + raft::update_device( + d_vertices.data(), h_vertices.data(), d_vertices.size(), handle.get_stream()); + + // renumber using d_renumber_map_gathered_v + cugraph::test::single_gpu_renumber_edgelist_given_number_map( + handle, d_edgelist_rows, d_edgelist_cols, d_renumber_map_gathered_v); + + std::tie(*sg_graph, std::ignore) = + cugraph::experimental::create_graph_from_edgelist( + handle, + std::optional>{ + std::make_tuple(d_vertices.data(), static_cast(d_vertices.size()))}, + std::move(d_edgelist_rows), + std::move(d_edgelist_cols), + std::move(d_edgelist_weights), + cugraph::experimental::graph_properties_t{is_symmetric, false}, + false); + } + + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(dendrogram.num_levels()), + [&dendrogram, &sg_graph, &d_clustering_v, &sg_modularity, &handle, resolution, rank]( + size_t i) { + auto d_dendrogram_gathered_v = cugraph::test::device_gatherv( + handle, dendrogram.get_level_ptr_nocheck(i), dendrogram.get_level_size_nocheck(i)); + + if (rank == 0) { + auto graph_view = sg_graph->view(); + + d_clustering_v.resize(graph_view.get_number_of_vertices(), handle.get_stream()); + + std::tie(std::ignore, sg_modularity) = + cugraph::louvain(handle, graph_view, d_clustering_v.data(), size_t{1}, resolution); + + EXPECT_TRUE(cugraph::test::renumbered_vectors_same( + handle, d_clustering_v, d_dendrogram_gathered_v)); + + sg_graph = + cugraph::test::coarsen_graph(handle, graph_view, d_dendrogram_gathered_v.data()); + } + }); + + if (rank == 0) compare(mg_modularity, sg_modularity); + } + + // Compare the results of running louvain on multiple GPUs to that of a + // single-GPU run for the configuration in param. Note that MNMG Louvain + // and single GPU Louvain are ONLY deterministic through a single + // iteration of the outer loop. Renumbering of the partitions when coarsening + // the graph is a function of the number of GPUs in the GPU cluster. 
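+  //
+  // For illustration (editorial sketch; choose_row_comm_size is a hypothetical
+  // name): the 2D GPU grid used in run_test below takes the largest divisor of
+  // comm_size that does not exceed sqrt(comm_size) as the row dimension, so the
+  // partition shape, and with it the renumbering, changes with the GPU count.
+  //
+  //   int choose_row_comm_size(int comm_size)
+  //   {
+  //     int r = static_cast<int>(sqrt(static_cast<double>(comm_size)));
+  //     while (comm_size % r != 0) { --r; }
+  //     return r;  // e.g. 1 GPU -> 1x1, 4 GPUs -> 2x2, 12 GPUs -> 3x4 grid
+  //   }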
+ template + void run_test(const Louvain_Usecase& param) + { + raft::handle_t handle; + + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); + const auto& comm = handle.get_comms(); + + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + auto row_comm_size = static_cast(sqrt(static_cast(comm_size))); + while (comm_size % row_comm_size != 0) { + --row_comm_size; + } + cugraph::partition_2d::subcomm_factory_t + subcomm_factory(handle, row_comm_size); + + cudaStream_t stream = handle.get_stream(); + + auto [mg_graph, d_renumber_map_labels] = + cugraph::test::read_graph_from_matrix_market_file( + handle, param.graph_file_full_path, true, true); + + auto mg_graph_view = mg_graph.view(); + + std::unique_ptr> dendrogram; + weight_t mg_modularity; + + std::tie(dendrogram, mg_modularity) = + cugraph::louvain(handle, mg_graph_view, param.max_level, param.resolution); + + SCOPED_TRACE("compare modularity input: " + param.graph_file_full_path); + + auto d_renumber_map_gathered_v = cugraph::test::device_gatherv( + handle, (*d_renumber_map_labels).data(), (*d_renumber_map_labels).size()); + + compare_sg_results(handle, + param.graph_file_full_path, + d_renumber_map_gathered_v, + *dendrogram, + param.resolution, + comm_rank, + mg_modularity); + } +}; + +//////////////////////////////////////////////////////////////////////////////// +TEST_P(Louvain_MG_Testfixture, CheckInt32Int32Float) +{ + run_test(GetParam()); +} + +INSTANTIATE_TEST_SUITE_P( + simple_test, + Louvain_MG_Testfixture, + ::testing::Values(Louvain_Usecase("test/datasets/karate.mtx", true, 100, 1) + //,Louvain_Usecase("test/datasets/smallworld.mtx", true, 100, 1) + )); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/community/triangle_test.cu b/cpp/tests/community/triangle_test.cu index 1c5c99261d2..4c51e15b111 100644 --- a/cpp/tests/community/triangle_test.cu +++ b/cpp/tests/community/triangle_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 
* * NVIDIA CORPORATION and its licensors retain all intellectual property * and proprietary rights in and to this software, related documentation @@ -10,8 +10,8 @@ */ #include -#include -#include +#include +#include #include @@ -49,7 +49,7 @@ TEST(triangle, dolphin) rmm::device_vector indices_v(ind_h); rmm::device_vector weights_v(w_h); - cugraph::GraphCSRView graph_csr( + cugraph::legacy::GraphCSRView graph_csr( offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); uint64_t count{0}; @@ -63,4 +63,47 @@ TEST(triangle, dolphin) ASSERT_EQ(count, expected); } +TEST(triangle, karate) +{ + using vertex_t = int32_t; + using edge_t = int32_t; + using weight_t = float; + + std::vector off_h = {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, + 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, + 98, 101, 104, 106, 110, 113, 117, 121, 127, 139, 156}; + std::vector ind_h = { + 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, + 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0, 6, + 10, 16, 0, 4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33, 0, 4, 5, 0, 0, 3, + 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1, 32, + 33, 25, 27, 29, 32, 33, 25, 27, 31, 23, 24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, + 32, 33, 1, 8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, + 33, 8, 9, 13, 14, 15, 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32}; + + std::vector w_h(ind_h.size(), weight_t{1.0}); + + vertex_t num_verts = off_h.size() - 1; + int num_edges = ind_h.size(); + + uint64_t expected{135}; + + rmm::device_vector offsets_v(off_h); + rmm::device_vector indices_v(ind_h); + rmm::device_vector weights_v(w_h); + + cugraph::legacy::GraphCSRView graph_csr( + offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); + + uint64_t count{0}; + + try { + count = cugraph::triangle::triangle_count(graph_csr); + } catch (std::exception& e) { + std::cout << "Exception: " << e.what() << std::endl; + } + + ASSERT_EQ(count, expected); +} + CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/components/con_comp_test.cu b/cpp/tests/components/con_comp_test.cu index 15d60867753..2db7235c299 100644 --- a/cpp/tests/components/con_comp_test.cu +++ b/cpp/tests/components/con_comp_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 
* 
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
@@ -18,9 +18,9 @@
 #include 
-#include 
 #include 
-#include 
+#include 
+#include 
 #include 
 #include 
@@ -113,9 +113,10 @@ struct Tests_Weakly_CC : ::testing::TestWithParam {
       << "\n";
     ASSERT_EQ(fclose(fpin), 0);
 
-    cugraph::GraphCOOView<int, int, float> G_coo(&cooRowInd[0], &cooColInd[0], nullptr, m, nnz);
-    auto G_unique = cugraph::coo_to_csr(G_coo);
-    cugraph::GraphCSRView<int, int, float> G = G_unique->view();
+    cugraph::legacy::GraphCOOView<int, int, float> G_coo(
+      &cooRowInd[0], &cooColInd[0], nullptr, m, nnz);
+    auto G_unique = cugraph::coo_to_csr(G_coo);
+    cugraph::legacy::GraphCSRView<int, int, float> G = G_unique->view();
 
     rmm::device_vector d_labels(m);
@@ -141,11 +142,11 @@ std::vector Tests_Weakly_CC::weakly_cc_time;
 TEST_P(Tests_Weakly_CC, Weakly_CC) { run_current_test(GetParam()); }
 
 // --gtest_filter=*simple_test*
-INSTANTIATE_TEST_CASE_P(simple_test,
-                        Tests_Weakly_CC,
-                        ::testing::Values(Usecase("test/datasets/dolphins.mtx"),
-                                          Usecase("test/datasets/coPapersDBLP.mtx"),
-                                          Usecase("test/datasets/coPapersCiteseer.mtx"),
-                                          Usecase("test/datasets/hollywood.mtx")));
+INSTANTIATE_TEST_SUITE_P(simple_test,
+                         Tests_Weakly_CC,
+                         ::testing::Values(Usecase("test/datasets/dolphins.mtx"),
+                                           Usecase("test/datasets/coPapersDBLP.mtx"),
+                                           Usecase("test/datasets/coPapersCiteseer.mtx"),
+                                           Usecase("test/datasets/hollywood.mtx")));
 
 CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/components/mg_weakly_connected_components_test.cpp b/cpp/tests/components/mg_weakly_connected_components_test.cpp
new file mode 100644
index 00000000000..11e3f803b38
--- /dev/null
+++ b/cpp/tests/components/mg_weakly_connected_components_test.cpp
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+// do the perf measurements
+// enabled by the command line parameter '--perf'
+//
+static int PERF = 0;
+
+struct WeaklyConnectedComponents_Usecase {
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGWeaklyConnectedComponents
+  : public ::testing::TestWithParam<
+      std::tuple<WeaklyConnectedComponents_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGWeaklyConnectedComponents() {}
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Compare the results of running weakly connected components on multiple GPUs to that of a
+  // single-GPU run
+  template <typename vertex_t, typename edge_t>
+  void run_current_test(
+    WeaklyConnectedComponents_Usecase const& weakly_connected_components_usecase,
+    input_usecase_t const& input_usecase)
+  {
+    using weight_t = float;
+
+    // 1.
initialize handle
+
+    raft::handle_t handle{};
+    HighResClock hr_clock{};
+
+    raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
+    auto& comm           = handle.get_comms();
+    auto const comm_size = comm.get_size();
+    auto const comm_rank = comm.get_rank();
+
+    auto row_comm_size = static_cast<int>(sqrt(static_cast<double>(comm_size)));
+    while (comm_size % row_comm_size != 0) {
+      --row_comm_size;
+    }
+    cugraph::partition_2d::subcomm_factory_t
+      subcomm_factory(handle, row_comm_size);
+
+    // 2. create MG graph
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      hr_clock.start();
+    }
+
+    auto [mg_graph, d_mg_renumber_map_labels] =
+      input_usecase.template construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        handle, false, true);
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG construct_graph took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    // 3. run MG weakly connected components
+
+    rmm::device_uvector<vertex_t> d_mg_components(mg_graph_view.get_number_of_local_vertices(),
+                                                  handle.get_stream());
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      hr_clock.start();
+    }
+
+    cugraph::experimental::weakly_connected_components(
+      handle, mg_graph_view, d_mg_components.data());
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG weakly_connected_components took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
+    // 4. compare SG & MG results
+
+    if (weakly_connected_components_usecase.check_correctness) {
+      // 4-1. aggregate MG results
+
+      auto d_mg_aggregate_renumber_map_labels = cugraph::test::device_gatherv(
+        handle, (*d_mg_renumber_map_labels).data(), (*d_mg_renumber_map_labels).size());
+      auto d_mg_aggregate_components =
+        cugraph::test::device_gatherv(handle, d_mg_components.data(), d_mg_components.size());
+
+      if (handle.get_comms().get_rank() == int{0}) {
+        // 4-2. unrenumber MG results
+
+        std::tie(std::ignore, d_mg_aggregate_components) =
+          cugraph::test::sort_by_key(handle,
+                                     d_mg_aggregate_renumber_map_labels.data(),
+                                     d_mg_aggregate_components.data(),
+                                     d_mg_aggregate_renumber_map_labels.size());
+
+        // 4-3. create SG graph
+
+        cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, false, false> sg_graph(handle);
+        std::tie(sg_graph, std::ignore) =
+          input_usecase.template construct_graph<vertex_t, edge_t, weight_t, false, false>(
+            handle, false, false);
+
+        auto sg_graph_view = sg_graph.view();
+
+        ASSERT_TRUE(mg_graph_view.get_number_of_vertices() ==
+                    sg_graph_view.get_number_of_vertices());
+
+        // 4-4. run SG weakly connected components
+
+        rmm::device_uvector<vertex_t> d_sg_components(sg_graph_view.get_number_of_vertices(),
+                                                      handle.get_stream());
+
+        cugraph::experimental::weakly_connected_components(
+          handle, sg_graph_view, d_sg_components.data());
+
+        // 4-5.
compare
+
+        std::vector<vertex_t> h_mg_aggregate_components(mg_graph_view.get_number_of_vertices());
+        raft::update_host(h_mg_aggregate_components.data(),
+                          d_mg_aggregate_components.data(),
+                          d_mg_aggregate_components.size(),
+                          handle.get_stream());
+
+        std::vector<vertex_t> h_sg_components(sg_graph_view.get_number_of_vertices());
+        raft::update_host(h_sg_components.data(),
+                          d_sg_components.data(),
+                          d_sg_components.size(),
+                          handle.get_stream());
+
+        handle.get_stream_view().synchronize();
+
+        std::unordered_map<vertex_t, vertex_t> mg_to_sg_map{};
+        for (size_t i = 0; i < h_sg_components.size(); ++i) {
+          mg_to_sg_map.insert({h_mg_aggregate_components[i], h_sg_components[i]});
+        }
+        std::transform(h_mg_aggregate_components.begin(),
+                       h_mg_aggregate_components.end(),
+                       h_mg_aggregate_components.begin(),
+                       [&mg_to_sg_map](auto mg_c) { return mg_to_sg_map[mg_c]; });
+
+        ASSERT_TRUE(std::equal(
+          h_sg_components.begin(), h_sg_components.end(), h_mg_aggregate_components.begin()))
+          << "components do not match the SG values.";
+      }
+    }
+  }
+};
+
+using Tests_MGWeaklyConnectedComponents_File =
+  Tests_MGWeaklyConnectedComponents<cugraph::test::File_Usecase>;
+using Tests_MGWeaklyConnectedComponents_Rmat =
+  Tests_MGWeaklyConnectedComponents<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGWeaklyConnectedComponents_File, CheckInt32Int32)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_MGWeaklyConnectedComponents_Rmat, CheckInt32Int32)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t>(std::get<0>(param), std::get<1>(param));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGWeaklyConnectedComponents_File,
+  ::testing::Combine(
+    // enable correctness checks
+    ::testing::Values(WeaklyConnectedComponents_Usecase{true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/polbooks.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/netscience.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(rmat_small_test,
+                         Tests_MGWeaklyConnectedComponents_Rmat,
+                         ::testing::Values(
+                           // enable correctness checks
+                           std::make_tuple(WeaklyConnectedComponents_Usecase{},
+                                           cugraph::test::Rmat_Usecase(
+                                             10, 16, 0.57, 0.19, 0.19, 0, true, false, 0, true))));
+
+INSTANTIATE_TEST_SUITE_P(rmat_large_test,
+                         Tests_MGWeaklyConnectedComponents_Rmat,
+                         ::testing::Values(
+                           // disable correctness checks
+                           std::make_tuple(WeaklyConnectedComponents_Usecase{false},
+                                           cugraph::test::Rmat_Usecase(
+                                             20, 16, 0.57, 0.19, 0.19, 0, true, false, 0, true))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/components/scc_test.cu b/cpp/tests/components/scc_test.cu
index 9d5b55f34c6..eaf6daf2d29 100644
--- a/cpp/tests/components/scc_test.cu
+++ b/cpp/tests/components/scc_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* * NVIDIA CORPORATION and its licensors retain all intellectual property * and proprietary rights in and to this software, related documentation @@ -16,14 +16,17 @@ #include #include -#include #include #include -#include +#include +#include #include #include +#include +#include +#include #include #include @@ -57,41 +60,48 @@ struct Usecase { std::string matrix_file; }; -// checker of counts of labels for each component -// expensive, for testing purposes only; +// counts number of vertices in each component; +// (of same label); +// potentially expensive, for testing purposes only; // // params: -// p_d_labels: device array of labels of size nrows; -// nrows: |V| for graph G(V, E); -// d_v_counts: #labels for each component; (_not_ pre-allocated!) +// in: p_d_labels: device array of labels of size nrows; +// in: nrows: |V| for graph G(V, E); +// out: d_v_counts: #labels for each component; (_not_ pre-allocated!) +// return: number of components; // template -size_t get_component_sizes(const IndexT* p_d_labels, size_t nrows, DVector& d_v_counts) +size_t get_component_sizes(const IndexT* p_d_labels, + size_t nrows, + DVector& d_num_vs_per_component) { DVector d_sorted_l(p_d_labels, p_d_labels + nrows); thrust::sort(d_sorted_l.begin(), d_sorted_l.end()); - size_t counts = - thrust::distance(d_sorted_l.begin(), thrust::unique(d_sorted_l.begin(), d_sorted_l.end())); + auto pair_it = thrust::reduce_by_key(d_sorted_l.begin(), + d_sorted_l.end(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), // ignore... + d_num_vs_per_component.begin()); - IndexT* p_d_srt_l = d_sorted_l.data().get(); - - d_v_counts.resize(counts); - thrust::transform( - thrust::device, - d_sorted_l.begin(), - d_sorted_l.begin() + counts, - d_v_counts.begin(), - [p_d_srt_l, counts] __device__(IndexT indx) { - return thrust::count_if( - thrust::seq, p_d_srt_l, p_d_srt_l + counts, [indx](IndexT label) { return label == indx; }); - }); - - // sort the counts: - thrust::sort(d_v_counts.begin(), d_v_counts.end()); + size_t counts = thrust::distance(d_num_vs_per_component.begin(), pair_it.second); + d_num_vs_per_component.resize(counts); return counts; } + +template +DVector byte_matrix_to_int(const DVector& d_adj_byte_matrix) +{ + auto n2 = d_adj_byte_matrix.size(); + thrust::device_vector d_vec_matrix(n2, 0); + thrust::transform(d_adj_byte_matrix.begin(), + d_adj_byte_matrix.end(), + d_vec_matrix.begin(), + [] __device__(auto byte_v) { return static_cast(byte_v); }); + return d_vec_matrix; +} + } // namespace struct Tests_Strongly_CC : ::testing::TestWithParam { @@ -105,7 +115,8 @@ struct Tests_Strongly_CC : ::testing::TestWithParam { } std::cout << "#iterations:\n"; - for (auto&& count : strongly_cc_counts) std::cout << count << std::endl; + for (auto&& count : strongly_cc_counts) + std::cout << count << std::endl; } } virtual void SetUp() {} @@ -154,8 +165,8 @@ struct Tests_Strongly_CC : ::testing::TestWithParam { // Allocate memory on host std::vector cooRowInd(nnz); std::vector cooColInd(nnz); - std::vector labels(m); // for G(V, E), m := |V| - std::vector verts(m); + std::vector labels(nrows); // for G(V, E), m := |V| + std::vector verts(nrows); // Read: COO Format // @@ -166,11 +177,12 @@ struct Tests_Strongly_CC : ::testing::TestWithParam { << "\n"; ASSERT_EQ(fclose(fpin), 0); - cugraph::GraphCOOView G_coo(&cooRowInd[0], &cooColInd[0], nullptr, m, nnz); - auto G_unique = cugraph::coo_to_csr(G_coo); - cugraph::GraphCSRView G = G_unique->view(); + cugraph::legacy::GraphCOOView G_coo( + &cooRowInd[0], 
&cooColInd[0], nullptr, nrows, nnz); + auto G_unique = cugraph::coo_to_csr(G_coo); + cugraph::legacy::GraphCSRView G = G_unique->view(); - rmm::device_vector d_labels(m); + rmm::device_vector d_labels(nrows); size_t count = 0; @@ -190,7 +202,7 @@ struct Tests_Strongly_CC : ::testing::TestWithParam { } strongly_cc_counts.push_back(count); - DVector d_counts; + DVector d_counts(nrows); auto count_labels = get_component_sizes(d_labels.data().get(), nrows, d_counts); } }; @@ -201,11 +213,220 @@ std::vector Tests_Strongly_CC::strongly_cc_counts; TEST_P(Tests_Strongly_CC, Strongly_CC) { run_current_test(GetParam()); } // --gtest_filter=*simple_test* -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( simple_test, Tests_Strongly_CC, ::testing::Values( Usecase("test/datasets/cage6.mtx") // DG "small" enough to meet SCC GPU memory requirements )); +struct SCCSmallTest : public ::testing::Test { +}; + +// FIXME: we should take advantage of gtest parameterization over copy-and-paste reuse. +// +TEST_F(SCCSmallTest, CustomGraphSimpleLoops) +{ + using IndexT = int; + + size_t nrows = 5; + size_t n2 = 2 * nrows * nrows; + + cudaDeviceProp prop; + int device = 0; + cudaGetDeviceProperties(&prop, device); + + ASSERT_TRUE(n2 < prop.totalGlobalMem); + + // Allocate memory on host + std::vector cooRowInd{0, 1, 2, 3, 3, 4}; + std::vector cooColInd{1, 0, 0, 1, 4, 3}; + std::vector labels(nrows); + std::vector verts(nrows); + + size_t nnz = cooRowInd.size(); + + EXPECT_EQ(nnz, cooColInd.size()); + + cugraph::legacy::GraphCOOView G_coo( + &cooRowInd[0], &cooColInd[0], nullptr, nrows, nnz); + auto G_unique = cugraph::coo_to_csr(G_coo); + cugraph::legacy::GraphCSRView G = G_unique->view(); + + rmm::device_vector d_labels(nrows); + + cugraph::connected_components(G, cugraph::cugraph_cc_t::CUGRAPH_STRONG, d_labels.data().get()); + + DVector d_counts(nrows); + auto count_components = get_component_sizes(d_labels.data().get(), nrows, d_counts); + + EXPECT_EQ(count_components, static_cast(3)); + + std::vector v_counts(d_counts.size()); + + cudaMemcpy(v_counts.data(), + d_counts.data().get(), + sizeof(size_t) * v_counts.size(), + cudaMemcpyDeviceToHost); + + cudaDeviceSynchronize(); + + std::vector v_counts_exp{2, 1, 2}; + + EXPECT_EQ(v_counts, v_counts_exp); +} + +TEST_F(SCCSmallTest, /*DISABLED_*/ CustomGraphWithSelfLoops) +{ + using IndexT = int; + + size_t nrows = 5; + size_t n2 = 2 * nrows * nrows; + + cudaDeviceProp prop; + int device = 0; + cudaGetDeviceProperties(&prop, device); + + ASSERT_TRUE(n2 < prop.totalGlobalMem); + + // Allocate memory on host + std::vector cooRowInd{0, 0, 1, 1, 2, 2, 3, 3, 4}; + std::vector cooColInd{0, 1, 0, 1, 0, 2, 1, 3, 4}; + std::vector labels(nrows); + std::vector verts(nrows); + + size_t nnz = cooRowInd.size(); + + EXPECT_EQ(nnz, cooColInd.size()); + + cugraph::legacy::GraphCOOView G_coo( + &cooRowInd[0], &cooColInd[0], nullptr, nrows, nnz); + auto G_unique = cugraph::coo_to_csr(G_coo); + cugraph::legacy::GraphCSRView G = G_unique->view(); + + rmm::device_vector d_labels(nrows); + + cugraph::connected_components(G, cugraph::cugraph_cc_t::CUGRAPH_STRONG, d_labels.data().get()); + + DVector d_counts(nrows); + auto count_components = get_component_sizes(d_labels.data().get(), nrows, d_counts); + + EXPECT_EQ(count_components, static_cast(4)); + + std::vector v_counts(d_counts.size()); + + cudaMemcpy(v_counts.data(), + d_counts.data().get(), + sizeof(size_t) * v_counts.size(), + cudaMemcpyDeviceToHost); + + cudaDeviceSynchronize(); + + std::vector v_counts_exp{2, 1, 1, 1}; + + 
EXPECT_EQ(v_counts, v_counts_exp); +} + +TEST_F(SCCSmallTest, SmallGraphWithSelfLoops1) +{ + using IndexT = int; + + size_t nrows = 3; + + std::vector cooRowInd{0, 0, 1, 2}; + std::vector cooColInd{0, 1, 0, 0}; + + std::vector v_counts_exp{2, 1}; + + std::vector labels(nrows); + std::vector verts(nrows); + + size_t nnz = cooRowInd.size(); + + EXPECT_EQ(nnz, cooColInd.size()); + + cugraph::legacy::GraphCOOView G_coo( + &cooRowInd[0], &cooColInd[0], nullptr, nrows, nnz); + auto G_unique = cugraph::coo_to_csr(G_coo); + cugraph::legacy::GraphCSRView G = G_unique->view(); + + rmm::device_vector d_labels(nrows); + + cugraph::connected_components(G, cugraph::cugraph_cc_t::CUGRAPH_STRONG, d_labels.data().get()); + + DVector d_counts(nrows); + auto count_components = get_component_sizes(d_labels.data().get(), nrows, d_counts); + + // std::cout << "vertex labels:\n"; + // print_v(d_labels, std::cout); + + decltype(count_components) num_components_exp = 2; + + EXPECT_EQ(count_components, num_components_exp); +} + +TEST_F(SCCSmallTest, SmallGraphWithIsolated) +{ + using IndexT = int; + + size_t nrows = 3; + + std::vector cooRowInd{0, 0, 1}; + std::vector cooColInd{0, 1, 0}; + + std::vector v_counts_exp{2, 1}; + + std::vector labels(nrows); + std::vector verts(nrows); + + size_t nnz = cooRowInd.size(); + + EXPECT_EQ(nnz, cooColInd.size()); + + // Note: there seems to be a BUG in coo_to_csr() or view() + // COO format doesn't account for isolated vertices; + // + // cugraph::legacy::GraphCOOView G_coo(&cooRowInd[0], &cooColInd[0], nullptr, + // nrows, nnz); auto G_unique = cugraph::coo_to_csr(G_coo); + // cugraph::legacy::GraphCSRView G = G_unique->view(); + // + // + // size_t num_vertices = G.number_of_vertices; + // size_t num_edges = G.number_of_edges; + // + // EXPECT_EQ(num_vertices, nrows); //fails when G was constructed from COO + // EXPECT_EQ(num_edges, nnz); + + std::vector ro{0, 2, 3, 3}; + std::vector ci{0, 1, 0}; + + nnz = ci.size(); + + thrust::device_vector d_ro(ro); + thrust::device_vector d_ci(ci); + + cugraph::legacy::GraphCSRView G{ + d_ro.data().get(), d_ci.data().get(), nullptr, static_cast(nrows), static_cast(nnz)}; + + size_t num_vertices = G.number_of_vertices; + size_t num_edges = G.number_of_edges; + + EXPECT_EQ(num_vertices, nrows); + EXPECT_EQ(num_edges, nnz); + + rmm::device_vector d_labels(nrows); + + cugraph::connected_components(G, cugraph::cugraph_cc_t::CUGRAPH_STRONG, d_labels.data().get()); + + DVector d_counts(nrows); + auto count_components = get_component_sizes(d_labels.data().get(), nrows, d_counts); + + // std::cout << "vertex labels:\n"; + // print_v(d_labels, std::cout); + + decltype(count_components) num_components_exp = 2; + + EXPECT_EQ(count_components, num_components_exp); +} + CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/components/wcc_graphs.cu b/cpp/tests/components/wcc_graphs.cu new file mode 100644 index 00000000000..3ceebfd46a2 --- /dev/null +++ b/cpp/tests/components/wcc_graphs.cu @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ * + */ + +#include +#include + +#include +#include + +#include + +#include + +namespace cugraph { +namespace test { + +template +std::tuple, + std::optional>> +LineGraph_Usecase::construct_graph(raft::handle_t const& handle, + bool test_weighted, + bool renumber) const +{ + uint64_t seed{0}; + + edge_t num_edges = 2 * (num_vertices_ - 1); + + rmm::device_uvector vertices_v(num_vertices_, handle.get_stream()); + rmm::device_uvector src_v(num_edges, handle.get_stream()); + rmm::device_uvector dst_v(num_edges, handle.get_stream()); + rmm::device_uvector order_v(num_vertices_, handle.get_stream()); + + thrust::sequence( + rmm::exec_policy(handle.get_stream()), vertices_v.begin(), vertices_v.end(), vertex_t{0}); + + cugraph::detail::uniform_random_fill( + handle.get_stream_view(), order_v.data(), num_vertices_, double{0.0}, double{1.0}, seed); + + thrust::sort_by_key( + rmm::exec_policy(handle.get_stream()), order_v.begin(), order_v.end(), vertices_v.begin()); + + raft::copy(src_v.begin(), vertices_v.begin(), (num_vertices_ - 1), handle.get_stream()); + raft::copy(dst_v.begin(), vertices_v.begin() + 1, (num_vertices_ - 1), handle.get_stream()); + + raft::copy(src_v.begin() + (num_vertices_ - 1), + vertices_v.begin() + 1, + (num_vertices_ - 1), + handle.get_stream()); + raft::copy(dst_v.begin() + (num_vertices_ - 1), + vertices_v.begin(), + (num_vertices_ - 1), + handle.get_stream()); + + thrust::sequence( + rmm::exec_policy(handle.get_stream()), vertices_v.begin(), vertices_v.end(), vertex_t{0}); + + handle.get_stream_view().synchronize(); + + return cugraph::experimental:: + create_graph_from_edgelist( + handle, + std::optional>{ + std::make_tuple(vertices_v.data(), static_cast(vertices_v.size()))}, + std::move(src_v), + std::move(dst_v), + std::nullopt, + cugraph::experimental::graph_properties_t{true, false}, + false); +} + +template std::tuple, + std::optional>> +LineGraph_Usecase::construct_graph(raft::handle_t const&, bool, bool) const; + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/components/wcc_graphs.hpp b/cpp/tests/components/wcc_graphs.hpp new file mode 100644 index 00000000000..cea82cb95bc --- /dev/null +++ b/cpp/tests/components/wcc_graphs.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + * + */ + +#include + +#include + +namespace cugraph { +namespace test { + +class LineGraph_Usecase { + public: + LineGraph_Usecase() = delete; + + LineGraph_Usecase(size_t num_vertices) : num_vertices_(num_vertices) {} + + template + std::tuple< + cugraph::experimental::graph_t, + std::optional>> + construct_graph(raft::handle_t const& handle, bool test_weighted, bool renumber = true) const; + + private: + size_t num_vertices_{0}; +}; + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/components/wcc_test.cpp b/cpp/tests/components/wcc_test.cpp new file mode 100644 index 00000000000..9f6254d445f --- /dev/null +++ b/cpp/tests/components/wcc_test.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + * + */ + +#include +#include +#include + +#include +#include + +#include +#include + +struct WCC_Usecase { + bool validate_results{true}; +}; + +template +class Tests_WCC : public ::testing::TestWithParam> { + public: + Tests_WCC() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + static std::vector weakly_cc_time; + + template + void run_current_test(WCC_Usecase const& param, input_usecase_t const& input_usecase) + { + raft::handle_t handle{}; + + cugraph::experimental::graph_t graph(handle); + + std::cout << "calling construct_graph" << std::endl; + + std::tie(graph, std::ignore) = + cugraph::test::construct_graph( + handle, input_usecase, false, false); + + std::cout << "back from construct_graph" << std::endl; + + auto graph_view = graph.view(); + + rmm::device_uvector component_labels_v(graph_view.get_number_of_vertices(), + handle.get_stream()); + + // cugraph::weakly_connected_components(handle, graph_view, component_labels_v.begin()); + + // TODO: validate result + } +}; + +using Tests_WCC_File = Tests_WCC; +using Tests_WCC_Rmat = Tests_WCC; +using Tests_WCC_PathGraph = Tests_WCC; + +TEST_P(Tests_WCC_File, WCC) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} +TEST_P(Tests_WCC_Rmat, WCC) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} +TEST_P(Tests_WCC_PathGraph, WCC) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +// --gtest_filter=*simple_test* +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_WCC_File, + ::testing::Values( + std::make_tuple(WCC_Usecase{}, cugraph::test::File_Usecase("test/datasets/dolphins.mtx")), + std::make_tuple(WCC_Usecase{}, cugraph::test::File_Usecase("test/datasets/coPapersDBLP.mtx")), + std::make_tuple(WCC_Usecase{}, + cugraph::test::File_Usecase("test/datasets/coPapersCiteseer.mtx")), + std::make_tuple(WCC_Usecase{}, cugraph::test::File_Usecase("test/datasets/hollywood.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + path_graph_test, + Tests_WCC_PathGraph, + ::testing::Values(std::make_tuple(WCC_Usecase{}, + cugraph::test::PathGraph_Usecase( + std::vector>({{1000, 0}}))), + std::make_tuple(WCC_Usecase{}, + cugraph::test::PathGraph_Usecase( + std::vector>({{100000, 0}}))))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/components/weakly_connected_components_test.cpp b/cpp/tests/components/weakly_connected_components_test.cpp new file mode 100644 index 00000000000..fe87e806687 --- /dev/null +++ b/cpp/tests/components/weakly_connected_components_test.cpp @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +// do the perf measurements +// enabled by command line parameter '--perf' +// +static int PERF = 0; + +template +void weakly_connected_components_reference(edge_t const* offsets, + vertex_t const* indices, + vertex_t* components, + vertex_t num_vertices) +{ + vertex_t depth{0}; + + std::fill(components, + components + num_vertices, + cugraph::experimental::invalid_component_id::value); + + vertex_t num_scanned{0}; + while (true) { + auto it = std::find(components + num_scanned, + components + num_vertices, + cugraph::experimental::invalid_component_id::value); + if (it == components + num_vertices) { break; } + num_scanned += static_cast(std::distance(components + num_scanned, it)); + auto source = num_scanned; + *(components + source) = source; + std::vector cur_frontier_rows{source}; + std::vector new_frontier_rows{}; + + while (cur_frontier_rows.size() > 0) { + for (auto const row : cur_frontier_rows) { + auto nbr_offset_first = *(offsets + row); + auto nbr_offset_last = *(offsets + row + 1); + for (auto nbr_offset = nbr_offset_first; nbr_offset != nbr_offset_last; ++nbr_offset) { + auto nbr = *(indices + nbr_offset); + if (*(components + nbr) == cugraph::experimental::invalid_component_id::value) { + *(components + nbr) = source; + new_frontier_rows.push_back(nbr); + } + } + } + std::swap(cur_frontier_rows, new_frontier_rows); + new_frontier_rows.clear(); + } + } + + return; +} + +struct WeaklyConnectedComponents_Usecase { + bool check_correctness{true}; +}; + +template +class Tests_WeaklyConnectedComponent + : public ::testing::TestWithParam< + std::tuple> { + public: + Tests_WeaklyConnectedComponent() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test( + WeaklyConnectedComponents_Usecase const& weakly_connected_components_usecase, + input_usecase_t const& input_usecase) + { + constexpr bool renumber = true; + + using weight_t = float; + + raft::handle_t handle{}; + HighResClock hr_clock{}; + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } + + auto [graph, d_renumber_map_labels] = + input_usecase.template construct_graph( + handle, false, renumber); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "construct_graph took " << elapsed_time * 1e-6 << " s.\n"; + } + + auto graph_view = graph.view(); + ASSERT_TRUE(graph_view.is_symmetric()) + << "Weakly connected components works only on undirected (symmetric) graphs."; + + rmm::device_uvector d_components(graph_view.get_number_of_vertices(), + handle.get_stream()); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } + + cugraph::experimental::weakly_connected_components(handle, graph_view, d_components.data()); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "weakly_connected_components took " << elapsed_time * 1e-6 << " s.\n"; + } + + if 
(weakly_connected_components_usecase.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore) = + input_usecase.template construct_graph( + handle, false, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.get_matrix_partition_view().get_offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.get_matrix_partition_view().get_indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + std::vector h_reference_components( + unrenumbered_graph_view.get_number_of_vertices()); + + weakly_connected_components_reference(h_offsets.data(), + h_indices.data(), + h_reference_components.data(), + unrenumbered_graph_view.get_number_of_vertices()); + + std::vector h_cugraph_components(graph_view.get_number_of_vertices()); + if (renumber) { + rmm::device_uvector d_unrenumbered_components(size_t{0}, + handle.get_stream_view()); + std::tie(std::ignore, d_unrenumbered_components) = + cugraph::test::sort_by_key(handle, + (*d_renumber_map_labels).data(), + d_components.data(), + (*d_renumber_map_labels).size()); + raft::update_host(h_cugraph_components.data(), + d_unrenumbered_components.data(), + d_unrenumbered_components.size(), + handle.get_stream()); + } else { + raft::update_host(h_cugraph_components.data(), + d_components.data(), + d_components.size(), + handle.get_stream()); + } + handle.get_stream_view().synchronize(); + + std::unordered_map cuda_to_reference_map{}; + for (size_t i = 0; i < h_reference_components.size(); ++i) { + cuda_to_reference_map.insert({h_cugraph_components[i], h_reference_components[i]}); + } + std::transform( + h_cugraph_components.begin(), + h_cugraph_components.end(), + h_cugraph_components.begin(), + [&cuda_to_reference_map](auto cugraph_c) { return cuda_to_reference_map[cugraph_c]; }); + + ASSERT_TRUE(std::equal( + h_reference_components.begin(), h_reference_components.end(), h_cugraph_components.begin())) + << "components do not match with the reference values."; + } + } +}; + +using Tests_WeaklyConnectedComponents_File = + Tests_WeaklyConnectedComponent; +using Tests_WeaklyConnectedComponents_Rmat = + Tests_WeaklyConnectedComponent; + +// FIXME: add tests for type combinations +TEST_P(Tests_WeaklyConnectedComponents_File, CheckInt32Int32) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_WeaklyConnectedComponents_Rmat, CheckInt32Int32) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_WeaklyConnectedComponents_File, + ::testing::Values( + // enable correctness checks + std::make_tuple(WeaklyConnectedComponents_Usecase{}, + cugraph::test::File_Usecase("test/datasets/karate.mtx")), + std::make_tuple(WeaklyConnectedComponents_Usecase{}, + cugraph::test::File_Usecase("test/datasets/polbooks.mtx")), + std::make_tuple(WeaklyConnectedComponents_Usecase{}, + cugraph::test::File_Usecase("test/datasets/netscience.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_WeaklyConnectedComponents_Rmat, + 
::testing::Values( + // enable correctness checks + std::make_tuple(WeaklyConnectedComponents_Usecase{}, + cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true, false)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_large_test, + Tests_WeaklyConnectedComponents_Rmat, + ::testing::Values( + // disable correctness checks + std::make_tuple(WeaklyConnectedComponents_Usecase{false}, + cugraph::test::Rmat_Usecase(20, 16, 0.57, 0.19, 0.19, 0, true, false)))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/bfs_test.cpp b/cpp/tests/experimental/bfs_test.cpp index 2498ca4f3f5..8510d6698ca 100644 --- a/cpp/tests/experimental/bfs_test.cpp +++ b/cpp/tests/experimental/bfs_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,12 +14,16 @@ * limitations under the License. */ +#include #include +#include #include +#include -#include -#include -#include +#include +#include +#include +#include #include #include @@ -28,13 +32,19 @@ #include +#include #include #include #include +// do the perf measurements +// enabled by command line parameter '--perf' +// +static int PERF = 0; + template -void bfs_reference(edge_t* offsets, - vertex_t* indices, +void bfs_reference(edge_t const* offsets, + vertex_t const* indices, vertex_t* distances, vertex_t* predecessors, vertex_t num_vertices, @@ -44,7 +54,9 @@ void bfs_reference(edge_t* offsets, vertex_t depth{0}; std::fill(distances, distances + num_vertices, std::numeric_limits::max()); - std::fill(predecessors, predecessors + num_vertices, cugraph::invalid_vertex_id::value); + std::fill(predecessors, + predecessors + num_vertices, + cugraph::experimental::invalid_vertex_id::value); *(distances + source) = depth; std::vector cur_frontier_rows{source}; @@ -72,21 +84,13 @@ void bfs_reference(edge_t* offsets, return; } -typedef struct BFS_Usecase_t { - std::string graph_file_full_path{}; - size_t source{false}; - - BFS_Usecase_t(std::string const& graph_file_path, size_t source) : source(source) - { - if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { - graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; - } else { - graph_file_full_path = graph_file_path; - } - }; -} BFS_Usecase; +struct BFS_Usecase { + size_t source{0}; + bool check_correctness{true}; +}; -class Tests_BFS : public ::testing::TestWithParam { +template +class Tests_BFS : public ::testing::TestWithParam> { public: Tests_BFS() {} static void SetupTestCase() {} @@ -96,109 +100,222 @@ class Tests_BFS : public ::testing::TestWithParam { virtual void TearDown() {} template - void run_current_test(BFS_Usecase const& configuration) + void run_current_test(BFS_Usecase const& bfs_usecase, input_usecase_t const& input_usecase) { + constexpr bool renumber = true; + using weight_t = float; raft::handle_t handle{}; + HighResClock hr_clock{}; - auto graph = - cugraph::test::read_graph_from_matrix_market_file( - handle, configuration.graph_file_full_path, false); + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } + + auto [graph, d_renumber_map_labels] = + input_usecase.template construct_graph( + handle, true, renumber); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + 
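// elapsed_time is reported in microseconds here; it is scaled by 1e-6 below when printed in seconds. + 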
hr_clock.stop(&elapsed_time); + std::cout << "construct_graph took " << elapsed_time * 1e-6 << " s.\n"; + } auto graph_view = graph.view(); - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - ASSERT_TRUE(configuration.source >= 0 && - configuration.source <= graph_view.get_number_of_vertices()) - << "Starting sources should be >= 0 and" - << " less than the number of vertices in the graph."; - - std::vector h_reference_distances(graph_view.get_number_of_vertices()); - std::vector h_reference_predecessors(graph_view.get_number_of_vertices()); - - bfs_reference(h_offsets.data(), - h_indices.data(), - h_reference_distances.data(), - h_reference_predecessors.data(), - graph_view.get_number_of_vertices(), - static_cast(configuration.source), - std::numeric_limits::max()); + ASSERT_TRUE(static_cast(bfs_usecase.source) >= 0 && + static_cast(bfs_usecase.source) < graph_view.get_number_of_vertices()) + << "Invalid starting source."; rmm::device_uvector d_distances(graph_view.get_number_of_vertices(), handle.get_stream()); rmm::device_uvector d_predecessors(graph_view.get_number_of_vertices(), handle.get_stream()); - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } cugraph::experimental::bfs(handle, graph_view, - d_distances.begin(), - d_predecessors.begin(), - static_cast(configuration.source), + d_distances.data(), + d_predecessors.data(), + static_cast(bfs_usecase.source), false, - std::numeric_limits::max(), - false); - - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - - std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); - std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); - - raft::update_host( - h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); - raft::update_host(h_cugraph_predecessors.data(), - d_predecessors.data(), - d_predecessors.size(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - ASSERT_TRUE(std::equal( - h_reference_distances.begin(), h_reference_distances.end(), h_cugraph_distances.begin())) - << "distances do not match with the reference values."; - - for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { - auto i = std::distance(h_cugraph_predecessors.begin(), it); - if (*it == cugraph::invalid_vertex_id::value) { - ASSERT_TRUE(h_reference_predecessors[i] == *it) - << "vertex reachability do not match with the reference."; + std::numeric_limits::max()); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "BFS took " << elapsed_time * 1e-6 << " s.\n"; + } + + if (bfs_usecase.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore) = + input_usecase.template construct_graph( + handle, true, false); + } + auto unrenumbered_graph_view = renumber ? 
unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.get_matrix_partition_view().get_offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.get_matrix_partition_view().get_indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto unrenumbered_source = static_cast(bfs_usecase.source); + if (renumber) { + std::vector h_renumber_map_labels((*d_renumber_map_labels).size()); + raft::update_host(h_renumber_map_labels.data(), + (*d_renumber_map_labels).data(), + (*d_renumber_map_labels).size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + unrenumbered_source = h_renumber_map_labels[bfs_usecase.source]; + } + + std::vector h_reference_distances(unrenumbered_graph_view.get_number_of_vertices()); + std::vector h_reference_predecessors( + unrenumbered_graph_view.get_number_of_vertices()); + + bfs_reference(h_offsets.data(), + h_indices.data(), + h_reference_distances.data(), + h_reference_predecessors.data(), + unrenumbered_graph_view.get_number_of_vertices(), + unrenumbered_source, + std::numeric_limits::max()); + + std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); + std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); + if (renumber) { + cugraph::experimental::unrenumber_local_int_vertices(handle, + d_predecessors.data(), + d_predecessors.size(), + (*d_renumber_map_labels).data(), + vertex_t{0}, + graph_view.get_number_of_vertices(), + true); + + rmm::device_uvector d_unrenumbered_distances(size_t{0}, handle.get_stream()); + std::tie(std::ignore, d_unrenumbered_distances) = + cugraph::test::sort_by_key(handle, + (*d_renumber_map_labels).data(), + d_distances.data(), + (*d_renumber_map_labels).size()); + rmm::device_uvector d_unrenumbered_predecessors(size_t{0}, handle.get_stream()); + std::tie(std::ignore, d_unrenumbered_predecessors) = + cugraph::test::sort_by_key(handle, + (*d_renumber_map_labels).data(), + d_predecessors.data(), + (*d_renumber_map_labels).size()); + raft::update_host(h_cugraph_distances.data(), + d_unrenumbered_distances.data(), + d_unrenumbered_distances.size(), + handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_unrenumbered_predecessors.data(), + d_unrenumbered_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); } else { - ASSERT_TRUE(h_reference_distances[*it] + 1 == h_reference_distances[i]) - << "distance to this vertex != distance to the predecessor vertex + 1."; - bool found{false}; - for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { - if (h_indices[j] == i) { - found = true; - break; + raft::update_host( + h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_predecessors.data(), + d_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + } + + ASSERT_TRUE(std::equal( + h_reference_distances.begin(), h_reference_distances.end(), h_cugraph_distances.begin())) + << "distances do not match with the reference values."; + + for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { + auto i = 
std::distance(h_cugraph_predecessors.begin(), it); + if (*it == cugraph::experimental::invalid_vertex_id::value) { + ASSERT_TRUE(h_reference_predecessors[i] == *it) + << "vertex reachability does not match with the reference."; + } else { + ASSERT_TRUE(h_reference_distances[*it] + 1 == h_reference_distances[i]) + << "distance to this vertex != distance to the predecessor vertex + 1."; + bool found{false}; + for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { + if (h_indices[j] == i) { + found = true; + break; + } } + ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex."; } - ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex."; } } } }; +using Tests_BFS_File = Tests_BFS; +using Tests_BFS_Rmat = Tests_BFS; + // FIXME: add tests for type combinations -TEST_P(Tests_BFS, CheckInt32Int32) { run_current_test(GetParam()); } - -INSTANTIATE_TEST_CASE_P(simple_test, - Tests_BFS, - ::testing::Values(BFS_Usecase("test/datasets/karate.mtx", 0), - BFS_Usecase("test/datasets/polbooks.mtx", 0), - BFS_Usecase("test/datasets/netscience.mtx", 0), - BFS_Usecase("test/datasets/netscience.mtx", 100), - BFS_Usecase("test/datasets/wiki2003.mtx", 1000), - BFS_Usecase("test/datasets/wiki-Talk.mtx", 1000))); +TEST_P(Tests_BFS_File, CheckInt32Int32) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_BFS_Rmat, CheckInt32Int32) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_BFS_File, + ::testing::Values( + // enable correctness checks + std::make_tuple(BFS_Usecase{0}, cugraph::test::File_Usecase("test/datasets/karate.mtx")), + std::make_tuple(BFS_Usecase{0}, cugraph::test::File_Usecase("test/datasets/polbooks.mtx")), + std::make_tuple(BFS_Usecase{0}, cugraph::test::File_Usecase("test/datasets/netscience.mtx")), + std::make_tuple(BFS_Usecase{100}, cugraph::test::File_Usecase("test/datasets/netscience.mtx")), + std::make_tuple(BFS_Usecase{1000}, cugraph::test::File_Usecase("test/datasets/wiki2003.mtx")), + std::make_tuple(BFS_Usecase{1000}, + cugraph::test::File_Usecase("test/datasets/wiki-Talk.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_BFS_Rmat, + ::testing::Values( + // enable correctness checks + std::make_tuple(BFS_Usecase{0}, + cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_large_test, + Tests_BFS_Rmat, + ::testing::Values( + // disable correctness checks for large graphs + std::make_pair(BFS_Usecase{0, false}, + cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/coarsen_graph_test.cpp b/cpp/tests/experimental/coarsen_graph_test.cpp new file mode 100644 index 00000000000..aaee198a4b0 --- /dev/null +++ b/cpp/tests/experimental/coarsen_graph_test.cpp @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +template +void check_coarsened_graph_results(edge_t* org_offsets, + vertex_t* org_indices, + weight_t* org_weights, + vertex_t* org_labels, + edge_t* coarse_offsets, + vertex_t* coarse_indices, + weight_t* coarse_weights, + vertex_t* coarse_vertex_labels, + vertex_t num_org_vertices, + vertex_t num_coarse_vertices) +{ + ASSERT_TRUE(((org_weights == nullptr) && (coarse_weights == nullptr)) || + ((org_weights != nullptr) && (coarse_weights != nullptr))); + ASSERT_TRUE(std::is_sorted(org_offsets, org_offsets + num_org_vertices)); + ASSERT_TRUE(std::count_if(org_indices, + org_indices + org_offsets[num_org_vertices], + [num_org_vertices](auto nbr) { + return !cugraph::experimental::is_valid_vertex(num_org_vertices, nbr); + }) == 0); + ASSERT_TRUE(std::is_sorted(coarse_offsets, coarse_offsets + num_coarse_vertices)); + ASSERT_TRUE(std::count_if(coarse_indices, + coarse_indices + coarse_offsets[num_coarse_vertices], + [num_coarse_vertices](auto nbr) { + return !cugraph::experimental::is_valid_vertex(num_coarse_vertices, + nbr); + }) == 0); + ASSERT_TRUE(num_coarse_vertices <= num_org_vertices); + + std::vector org_unique_labels(num_org_vertices); + std::iota(org_unique_labels.begin(), org_unique_labels.end(), vertex_t{0}); + std::transform(org_unique_labels.begin(), + org_unique_labels.end(), + org_unique_labels.begin(), + [org_labels](auto v) { return org_labels[v]; }); + std::sort(org_unique_labels.begin(), org_unique_labels.end()); + org_unique_labels.resize(std::distance( + org_unique_labels.begin(), std::unique(org_unique_labels.begin(), org_unique_labels.end()))); + + ASSERT_TRUE(org_unique_labels.size() == static_cast(num_coarse_vertices)); + + { + std::vector tmp_coarse_vertex_labels(coarse_vertex_labels, + coarse_vertex_labels + num_coarse_vertices); + std::sort(tmp_coarse_vertex_labels.begin(), tmp_coarse_vertex_labels.end()); + ASSERT_TRUE(std::unique(tmp_coarse_vertex_labels.begin(), tmp_coarse_vertex_labels.end()) == + tmp_coarse_vertex_labels.end()); + ASSERT_TRUE(std::equal( + org_unique_labels.begin(), org_unique_labels.end(), tmp_coarse_vertex_labels.begin())); + } + + std::vector> label_org_vertex_pairs(num_org_vertices); + for (vertex_t i = 0; i < num_org_vertices; ++i) { + label_org_vertex_pairs[i] = std::make_tuple(org_labels[i], i); + } + std::sort(label_org_vertex_pairs.begin(), label_org_vertex_pairs.end()); + + std::map label_to_coarse_vertex_map{}; + for (vertex_t i = 0; i < num_coarse_vertices; ++i) { + label_to_coarse_vertex_map[coarse_vertex_labels[i]] = i; + } + + auto threshold_ratio = (org_weights == nullptr) ? weight_t{1.0} /* irrelevant */ : weight_t{1e-4}; + auto threshold_magnitude = + (org_weights == nullptr) + ? 
weight_t{1.0} /* irrelevant */ + : (std::accumulate( + coarse_weights, coarse_weights + coarse_offsets[num_coarse_vertices], weight_t{0.0}) / + static_cast(coarse_offsets[num_coarse_vertices])) * + threshold_ratio; + + for (size_t i = 0; i < org_unique_labels.size(); ++i) { // for each vertex in the coarse graph + auto lb = std::lower_bound( + label_org_vertex_pairs.begin(), + label_org_vertex_pairs.end(), + std::make_tuple(org_unique_labels[i], + cugraph::experimental::invalid_vertex_id::value /* dummy */), + [](auto lhs, auto rhs) { return std::get<0>(lhs) < std::get<0>(rhs); }); + auto ub = std::upper_bound( + label_org_vertex_pairs.begin(), + label_org_vertex_pairs.end(), + std::make_tuple(org_unique_labels[i], + cugraph::experimental::invalid_vertex_id::value /* dummy */), + [](auto lhs, auto rhs) { return std::get<0>(lhs) < std::get<0>(rhs); }); + auto count = std::distance(lb, ub); + auto offset = std::distance(label_org_vertex_pairs.begin(), lb); + if (org_weights == nullptr) { + std::vector coarse_nbrs0{}; + std::for_each( + lb, + ub, + [org_offsets, org_indices, org_labels, &label_to_coarse_vertex_map, &coarse_nbrs0](auto t) { + auto org_vertex = std::get<1>(t); + std::vector tmp_nbrs(org_offsets[org_vertex + 1] - org_offsets[org_vertex]); + std::transform(org_indices + org_offsets[org_vertex], + org_indices + org_offsets[org_vertex + 1], + tmp_nbrs.begin(), + [org_labels, &label_to_coarse_vertex_map](auto nbr) { + return label_to_coarse_vertex_map[org_labels[nbr]]; + }); + coarse_nbrs0.insert(coarse_nbrs0.end(), tmp_nbrs.begin(), tmp_nbrs.end()); + }); + std::sort(coarse_nbrs0.begin(), coarse_nbrs0.end()); + coarse_nbrs0.resize( + std::distance(coarse_nbrs0.begin(), std::unique(coarse_nbrs0.begin(), coarse_nbrs0.end()))); + + auto coarse_vertex = label_to_coarse_vertex_map[org_unique_labels[i]]; + auto coarse_offset = coarse_offsets[coarse_vertex]; + auto coarse_count = coarse_offsets[coarse_vertex + 1] - coarse_offset; + std::vector coarse_nbrs1(coarse_indices + coarse_offset, + coarse_indices + coarse_offset + coarse_count); + std::sort(coarse_nbrs1.begin(), coarse_nbrs1.end()); + + ASSERT_TRUE(coarse_nbrs0.size() == coarse_nbrs1.size()); + ASSERT_TRUE(std::equal(coarse_nbrs0.begin(), coarse_nbrs0.end(), coarse_nbrs1.begin())); + } else { + std::vector> coarse_nbr_weight_pairs0{}; + std::for_each(lb, + ub, + [org_offsets, + org_indices, + org_weights, + org_labels, + &label_to_coarse_vertex_map, + &coarse_nbr_weight_pairs0](auto t) { + auto org_vertex = std::get<1>(t); + std::vector> tmp_pairs( + org_offsets[org_vertex + 1] - org_offsets[org_vertex]); + for (auto j = org_offsets[org_vertex]; j < org_offsets[org_vertex + 1]; ++j) { + tmp_pairs[j - org_offsets[org_vertex]] = std::make_tuple( + label_to_coarse_vertex_map[org_labels[org_indices[j]]], org_weights[j]); + } + coarse_nbr_weight_pairs0.insert( + coarse_nbr_weight_pairs0.end(), tmp_pairs.begin(), tmp_pairs.end()); + }); + std::sort(coarse_nbr_weight_pairs0.begin(), coarse_nbr_weight_pairs0.end()); + // reduce by key + { + size_t run_start_idx = 0; + for (size_t j = 1; j < coarse_nbr_weight_pairs0.size(); ++j) { + auto& start = coarse_nbr_weight_pairs0[run_start_idx]; + auto& cur = coarse_nbr_weight_pairs0[j]; + if (std::get<0>(start) == std::get<0>(cur)) { + std::get<1>(start) += std::get<1>(cur); + std::get<0>(cur) = cugraph::experimental::invalid_vertex_id::value; + } else { + run_start_idx = j; + } + } + coarse_nbr_weight_pairs0.erase( + std::remove_if(coarse_nbr_weight_pairs0.begin(), + 
coarse_nbr_weight_pairs0.end(), + [](auto t) { + return std::get<0>(t) == + cugraph::experimental::invalid_vertex_id::value; + }), + coarse_nbr_weight_pairs0.end()); + } + + auto coarse_vertex = label_to_coarse_vertex_map[org_unique_labels[i]]; + std::vector> coarse_nbr_weight_pairs1( + coarse_offsets[coarse_vertex + 1] - coarse_offsets[coarse_vertex]); + for (auto j = coarse_offsets[coarse_vertex]; j < coarse_offsets[coarse_vertex + 1]; ++j) { + coarse_nbr_weight_pairs1[j - coarse_offsets[coarse_vertex]] = + std::make_tuple(coarse_indices[j], coarse_weights[j]); + } + std::sort(coarse_nbr_weight_pairs1.begin(), coarse_nbr_weight_pairs1.end()); + + ASSERT_TRUE(coarse_nbr_weight_pairs0.size() == coarse_nbr_weight_pairs1.size()); + ASSERT_TRUE(std::equal( + coarse_nbr_weight_pairs0.begin(), + coarse_nbr_weight_pairs0.end(), + coarse_nbr_weight_pairs1.begin(), + [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::get<0>(lhs) == std::get<0>(rhs) + ? (std::abs(std::get<1>(lhs) - std::get<1>(rhs)) <= + std::max(std::max(std::abs(std::get<1>(lhs)), std::abs(std::get<1>(rhs))) * + threshold_ratio, + threshold_magnitude)) + : false; + })); + } + } + + return; +} + +typedef struct CoarsenGraph_Usecase_t { + std::string graph_file_full_path{}; + double coarsen_ratio{0.0}; + bool test_weighted{false}; + + CoarsenGraph_Usecase_t(std::string const& graph_file_path, + double coarsen_ratio, + bool test_weighted) + : coarsen_ratio(coarsen_ratio), test_weighted(test_weighted) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +} CoarsenGraph_Usecase; + +class Tests_CoarsenGraph : public ::testing::TestWithParam { + public: + Tests_CoarsenGraph() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(CoarsenGraph_Usecase const& configuration) + { + raft::handle_t handle{}; + + // FIXME: remove this once we drop Pascal support + if (handle.get_device_properties().major < 7) { // Pascal is not supported, skip testing + return; + } + + cugraph::experimental::graph_t graph( + handle); + std::tie(graph, std::ignore) = cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.graph_file_full_path, configuration.test_weighted, false); + auto graph_view = graph.view(); + + if (graph_view.get_number_of_vertices() == 0) { return; } + + std::vector h_labels(graph_view.get_number_of_vertices()); + auto num_labels = + std::max(static_cast(h_labels.size() * configuration.coarsen_ratio), vertex_t{1}); + + std::default_random_engine generator{}; + std::uniform_int_distribution distribution{0, num_labels - 1}; + + std::for_each(h_labels.begin(), h_labels.end(), [&distribution, &generator](auto& label) { + label = distribution(generator); + }); + + rmm::device_uvector d_labels(h_labels.size(), handle.get_stream()); + raft::update_device(d_labels.data(), h_labels.data(), h_labels.size(), handle.get_stream()); + + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + + std::unique_ptr< + cugraph::experimental::graph_t> + coarse_graph{}; + rmm::device_uvector coarse_vertices_to_labels(0, handle.get_stream()); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + std::tie(coarse_graph, coarse_vertices_to_labels) = + cugraph::experimental::coarsen_graph(handle, 
graph_view, d_labels.begin()); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + std::vector h_org_offsets(graph_view.get_number_of_vertices() + 1); + std::vector h_org_indices(graph_view.get_number_of_edges()); + std::vector h_org_weights{}; + raft::update_host(h_org_offsets.data(), + graph_view.get_matrix_partition_view().get_offsets(), + graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_org_indices.data(), + graph_view.get_matrix_partition_view().get_indices(), + graph_view.get_number_of_edges(), + handle.get_stream()); + if (graph_view.is_weighted()) { + h_org_weights.assign(graph_view.get_number_of_edges(), weight_t{0.0}); + raft::update_host(h_org_weights.data(), + *(graph_view.get_matrix_partition_view().get_weights()), + graph_view.get_number_of_edges(), + handle.get_stream()); + } + + auto coarse_graph_view = coarse_graph->view(); + + std::vector h_coarse_offsets(coarse_graph_view.get_number_of_vertices() + 1); + std::vector h_coarse_indices(coarse_graph_view.get_number_of_edges()); + std::vector h_coarse_weights{}; + raft::update_host(h_coarse_offsets.data(), + coarse_graph_view.get_matrix_partition_view().get_offsets(), + coarse_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_coarse_indices.data(), + coarse_graph_view.get_matrix_partition_view().get_indices(), + coarse_graph_view.get_number_of_edges(), + handle.get_stream()); + if (graph_view.is_weighted()) { + h_coarse_weights.resize(coarse_graph_view.get_number_of_edges()); + raft::update_host(h_coarse_weights.data(), + *(coarse_graph_view.get_matrix_partition_view().get_weights()), + coarse_graph_view.get_number_of_edges(), + handle.get_stream()); + } + + std::vector h_coarse_vertices_to_labels(coarse_vertices_to_labels.size()); + raft::update_host(h_coarse_vertices_to_labels.data(), + coarse_vertices_to_labels.data(), + coarse_vertices_to_labels.size(), + handle.get_stream()); + + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + + check_coarsened_graph_results(h_org_offsets.data(), + h_org_indices.data(), + h_org_weights.data(), + h_labels.data(), + h_coarse_offsets.data(), + h_coarse_indices.data(), + h_coarse_weights.data(), + h_coarse_vertices_to_labels.data(), + graph_view.get_number_of_vertices(), + coarse_graph_view.get_number_of_vertices()); + } +}; + +// FIXME: add tests for type combinations + +TEST_P(Tests_CoarsenGraph, CheckInt32Int32FloatTransposed) +{ + run_current_test(GetParam()); +} + +TEST_P(Tests_CoarsenGraph, CheckInt32Int32FloatUntransposed) +{ + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_SUITE_P( + simple_test, + Tests_CoarsenGraph, + ::testing::Values(CoarsenGraph_Usecase("test/datasets/karate.mtx", 0.2, false), + CoarsenGraph_Usecase("test/datasets/karate.mtx", 0.2, true), + CoarsenGraph_Usecase("test/datasets/web-Google.mtx", 0.1, false), + CoarsenGraph_Usecase("test/datasets/web-Google.mtx", 0.1, true), + CoarsenGraph_Usecase("test/datasets/ljournal-2008.mtx", 0.1, false), + CoarsenGraph_Usecase("test/datasets/ljournal-2008.mtx", 0.1, true), + CoarsenGraph_Usecase("test/datasets/webbase-1M.mtx", 0.1, false), + CoarsenGraph_Usecase("test/datasets/webbase-1M.mtx", 0.1, true))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/degree_test.cpp b/cpp/tests/experimental/degree_test.cpp new file mode 100644 index 00000000000..94134e3426f --- /dev/null +++ b/cpp/tests/experimental/degree_test.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2020-2021, NVIDIA 
CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +template +void degree_reference(edge_t const* offsets, + vertex_t const* indices, + edge_t* degrees, + vertex_t num_vertices, + bool major) +{ + if (major) { + std::adjacent_difference(offsets + 1, offsets + num_vertices + 1, degrees); + } else { + std::fill(degrees, degrees + num_vertices, edge_t{0}); + for (vertex_t i = 0; i < num_vertices; ++i) { + for (auto j = offsets[i]; j < offsets[i + 1]; ++j) { + auto nbr = indices[j]; + ++degrees[nbr]; + } + } + } + + return; +} + +typedef struct Degree_Usecase_t { + std::string graph_file_full_path{}; + + Degree_Usecase_t(std::string const& graph_file_path) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +} Degree_Usecase; + +class Tests_Degree : public ::testing::TestWithParam { + public: + Tests_Degree() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(Degree_Usecase const& configuration) + { + raft::handle_t handle{}; + + cugraph::experimental::graph_t graph( + handle); + std::tie(graph, std::ignore) = cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.graph_file_full_path, false, false); + auto graph_view = graph.view(); + + std::vector h_offsets(graph_view.get_number_of_vertices() + 1); + std::vector h_indices(graph_view.get_number_of_edges()); + raft::update_host(h_offsets.data(), + graph_view.get_matrix_partition_view().get_offsets(), + graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + graph_view.get_matrix_partition_view().get_indices(), + graph_view.get_number_of_edges(), + handle.get_stream()); + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + + std::vector h_reference_in_degrees(graph_view.get_number_of_vertices()); + std::vector h_reference_out_degrees(graph_view.get_number_of_vertices()); + + degree_reference(h_offsets.data(), + h_indices.data(), + h_reference_in_degrees.data(), + graph_view.get_number_of_vertices(), + store_transposed); + + degree_reference(h_offsets.data(), + h_indices.data(), + h_reference_out_degrees.data(), + graph_view.get_number_of_vertices(), + !store_transposed); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + auto d_in_degrees = graph_view.compute_in_degrees(handle); + auto d_out_degrees = graph_view.compute_out_degrees(handle); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + std::vector h_cugraph_in_degrees(graph_view.get_number_of_vertices()); + std::vector 
h_cugraph_out_degrees(graph_view.get_number_of_vertices()); + + raft::update_host( + h_cugraph_in_degrees.data(), d_in_degrees.data(), d_in_degrees.size(), handle.get_stream()); + raft::update_host(h_cugraph_out_degrees.data(), + d_out_degrees.data(), + d_out_degrees.size(), + handle.get_stream()); + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + + ASSERT_TRUE(std::equal( + h_reference_in_degrees.begin(), h_reference_in_degrees.end(), h_cugraph_in_degrees.begin())) + << "In-degree values do not match with the reference values."; + ASSERT_TRUE(std::equal(h_reference_out_degrees.begin(), + h_reference_out_degrees.end(), + h_cugraph_out_degrees.begin())) + << "Out-degree values do not match with the reference values."; + } +}; + +// FIXME: add tests for type combinations + +TEST_P(Tests_Degree, CheckInt32Int32FloatTransposed) +{ + run_current_test(GetParam()); +} + +TEST_P(Tests_Degree, CheckInt32Int32FloatUntransposed) +{ + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_SUITE_P(simple_test, + Tests_Degree, + ::testing::Values(Degree_Usecase("test/datasets/karate.mtx"), + Degree_Usecase("test/datasets/web-Google.mtx"), + Degree_Usecase("test/datasets/ljournal-2008.mtx"), + Degree_Usecase("test/datasets/webbase-1M.mtx"))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/generate_rmat_test.cpp b/cpp/tests/experimental/generate_rmat_test.cpp new file mode 100644 index 00000000000..7c2dbb3911a --- /dev/null +++ b/cpp/tests/experimental/generate_rmat_test.cpp @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +// this function assumes that vertex IDs are not scrambled +template +void validate_rmat_distribution( + std::tuple* edges, + size_t num_edges, + vertex_t src_first, + vertex_t src_last, + vertex_t dst_first, + vertex_t dst_last, + double a, + double b, + double c, + bool clip_and_flip, + size_t min_edges /* stop recursion if # edges < min_edges */, + double error_tolerance /* (computed a|b|c - input a|b|c) should be smaller than error_tolerance*/) +{ + // we cannot expect the ratios of the edges in the four quadrants of the graph adjacency matrix to + // converge close to a, b, c, d if num_edges is not large enough. 
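+ // At each recursion level an R-mat generator drops an edge into one of the four quadrants of the + // (recursively subdivided) adjacency matrix with probability a, b, c, or d = 1 - (a + b + c); the + // recursive partitioning below checks that the observed edge counts in each quadrant match these + // ratios to within error_tolerance.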
+ if (num_edges < min_edges) { return; } + + auto src_threshold = (src_first + src_last) / 2; + auto dst_threshold = (dst_first + dst_last) / 2; + + auto a_plus_b_last = std::partition(edges, edges + num_edges, [src_threshold](auto edge) { + return std::get<0>(edge) < src_threshold; + }); + auto a_last = std::partition( + edges, a_plus_b_last, [dst_threshold](auto edge) { return std::get<1>(edge) < dst_threshold; }); + auto c_last = std::partition(a_plus_b_last, edges + num_edges, [dst_threshold](auto edge) { + return std::get<1>(edge) < dst_threshold; + }); + + ASSERT_TRUE(std::abs((double)std::distance(edges, a_last) / num_edges - a) < error_tolerance) + << "# edges=" << num_edges << " computed a=" << (double)std::distance(edges, a_last) / num_edges + << " input a=" << a << " error tolerance=" << error_tolerance << "."; + if (clip_and_flip && (src_first == dst_first) && + (src_last == dst_last)) { // if clip_and_flip and on the diagonal + ASSERT_TRUE(std::distance(a_last, a_plus_b_last) == 0); + ASSERT_TRUE(std::abs((double)std::distance(a_plus_b_last, c_last) / num_edges - (b + c)) < + error_tolerance) + << "# edges=" << num_edges + << " computed c=" << (double)std::distance(a_plus_b_last, c_last) / num_edges + << " input (b + c)=" << (b + c) << " error tolerance=" << error_tolerance << "."; + } else { + ASSERT_TRUE(std::abs((double)std::distance(a_last, a_plus_b_last) / num_edges - b) < + error_tolerance) + << "# edges=" << num_edges + << " computed b=" << (double)std::distance(a_last, a_plus_b_last) / num_edges + << " input b=" << b << " error tolerance=" << error_tolerance << "."; + ASSERT_TRUE(std::abs((double)std::distance(a_plus_b_last, c_last) / num_edges - c) < + error_tolerance) + << "# edges=" << num_edges + << " computed c=" << (double)std::distance(a_plus_b_last, c_last) / num_edges + << " input c=" << c << " error tolerance=" << error_tolerance << "."; + } + + validate_rmat_distribution(edges, + std::distance(edges, a_last), + src_first, + src_threshold, + dst_first, + dst_threshold, + a, + b, + c, + clip_and_flip, + min_edges, + error_tolerance); + validate_rmat_distribution(a_last, + std::distance(a_last, a_plus_b_last), + src_first, + (src_first + src_last) / 2, + dst_threshold, + dst_last, + a, + b, + c, + clip_and_flip, + min_edges, + error_tolerance); + validate_rmat_distribution(a_plus_b_last, + std::distance(a_plus_b_last, c_last), + src_threshold, + src_last, + dst_first, + dst_threshold, + a, + b, + c, + clip_and_flip, + min_edges, + error_tolerance); + validate_rmat_distribution(c_last, + std::distance(c_last, edges + num_edges), + src_threshold, + src_last, + dst_threshold, + dst_last, + a, + b, + c, + clip_and_flip, + min_edges, + error_tolerance); + + return; +} + +typedef struct GenerateRmat_Usecase_t { + size_t scale{0}; + size_t edge_factor{0}; + double a{0.0}; + double b{0.0}; + double c{0.0}; + bool clip_and_flip{false}; + + GenerateRmat_Usecase_t( + size_t scale, size_t edge_factor, double a, double b, double c, bool clip_and_flip) + : scale(scale), edge_factor(edge_factor), a(a), b(b), c(c), clip_and_flip(clip_and_flip){}; +} GenerateRmat_Usecase; + +class Tests_GenerateRmat : public ::testing::TestWithParam { + public: + Tests_GenerateRmat() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(GenerateRmat_Usecase const& configuration) + { + raft::handle_t handle{}; + + auto num_vertices = static_cast(size_t{1} << configuration.scale); + 
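// degree histograms are collected from two generator runs (intended: one unscrambled, one + // scrambled; note the scramble argument is currently commented out in the call below); + // scrambling permutes vertex IDs, so it must not change the sorted degree sequences that + // are compared at the end of this test. + 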
std::vector no_scramble_out_degrees(num_vertices, 0); + std::vector no_scramble_in_degrees(num_vertices, 0); + std::vector scramble_out_degrees(num_vertices, 0); + std::vector scramble_in_degrees(num_vertices, 0); + for (size_t scramble = 0; scramble < 2; ++scramble) { + rmm::device_uvector d_srcs(0, handle.get_stream()); + rmm::device_uvector d_dsts(0, handle.get_stream()); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + std::tie(d_srcs, d_dsts) = cugraph::generate_rmat_edgelist( + handle, + configuration.scale, + (size_t{1} << configuration.scale) * configuration.edge_factor, + configuration.a, + configuration.b, + configuration.c, + uint64_t{0}, + configuration.clip_and_flip); + // static_cast(scramble)); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + std::vector h_cugraph_srcs(d_srcs.size()); + std::vector h_cugraph_dsts(d_dsts.size()); + + raft::update_host(h_cugraph_srcs.data(), d_srcs.data(), d_srcs.size(), handle.get_stream()); + raft::update_host(h_cugraph_dsts.data(), d_dsts.data(), d_dsts.size(), handle.get_stream()); + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + + ASSERT_TRUE( + (h_cugraph_srcs.size() == (size_t{1} << configuration.scale) * configuration.edge_factor) && + (h_cugraph_dsts.size() == (size_t{1} << configuration.scale) * configuration.edge_factor)) + << "Returned an invalid number of R-mat graph edges."; + ASSERT_TRUE(std::count_if(h_cugraph_srcs.begin(), + h_cugraph_srcs.end(), + [num_vertices = static_cast( + size_t{1} << configuration.scale)](auto v) { + return !cugraph::experimental::is_valid_vertex(num_vertices, v); + }) == 0) + << "Returned R-mat graph edges have invalid source vertex IDs."; + ASSERT_TRUE(std::count_if(h_cugraph_dsts.begin(), + h_cugraph_dsts.end(), + [num_vertices = static_cast( + size_t{1} << configuration.scale)](auto v) { + return !cugraph::experimental::is_valid_vertex(num_vertices, v); + }) == 0) + << "Returned R-mat graph edges have invalid destination vertex IDs."; + + if (!scramble) { + if (configuration.clip_and_flip) { + for (size_t i = 0; i < h_cugraph_srcs.size(); ++i) { + ASSERT_TRUE(h_cugraph_srcs[i] >= h_cugraph_dsts[i]); + } + } + + std::vector> h_cugraph_edges(h_cugraph_srcs.size()); + for (size_t i = 0; i < h_cugraph_srcs.size(); ++i) { + h_cugraph_edges[i] = std::make_tuple(h_cugraph_srcs[i], h_cugraph_dsts[i]); + } + + validate_rmat_distribution(h_cugraph_edges.data(), + h_cugraph_edges.size(), + vertex_t{0}, + num_vertices, + vertex_t{0}, + num_vertices, + configuration.a, + configuration.b, + configuration.c, + configuration.clip_and_flip, + size_t{100000}, + 0.01); + } + + if (scramble) { + std::for_each(h_cugraph_srcs.begin(), + h_cugraph_srcs.end(), + [&scramble_out_degrees](auto src) { scramble_out_degrees[src]++; }); + std::for_each(h_cugraph_dsts.begin(), + h_cugraph_dsts.end(), + [&scramble_in_degrees](auto dst) { scramble_in_degrees[dst]++; }); + std::sort(scramble_out_degrees.begin(), scramble_out_degrees.end()); + std::sort(scramble_in_degrees.begin(), scramble_in_degrees.end()); + } else { + std::for_each(h_cugraph_srcs.begin(), + h_cugraph_srcs.end(), + [&no_scramble_out_degrees](auto src) { no_scramble_out_degrees[src]++; }); + std::for_each(h_cugraph_dsts.begin(), + h_cugraph_dsts.end(), + [&no_scramble_in_degrees](auto dst) { no_scramble_in_degrees[dst]++; }); + std::sort(no_scramble_out_degrees.begin(), no_scramble_out_degrees.end()); + std::sort(no_scramble_in_degrees.begin(), no_scramble_in_degrees.end()); + } + } + 
+ + // this relies on the fact that the edge generator is deterministic. + // ideally, we should test that the two graphs are isomorphic, but this is NP-hard; instead, we + // just check out-degree & in-degree distributions + ASSERT_TRUE(std::equal(no_scramble_out_degrees.begin(), + no_scramble_out_degrees.end(), + scramble_out_degrees.begin())); + ASSERT_TRUE(std::equal( + no_scramble_in_degrees.begin(), no_scramble_in_degrees.end(), scramble_in_degrees.begin())); + } +}; + +// FIXME: add tests for type combinations + +TEST_P(Tests_GenerateRmat, CheckInt32) { run_current_test(GetParam()); } + +INSTANTIATE_TEST_SUITE_P(simple_test, + Tests_GenerateRmat, + ::testing::Values(GenerateRmat_Usecase(20, 16, 0.57, 0.19, 0.19, true), + GenerateRmat_Usecase(20, 16, 0.57, 0.19, 0.19, false), + GenerateRmat_Usecase(20, 16, 0.45, 0.22, 0.22, true), + GenerateRmat_Usecase(20, 16, 0.45, 0.22, 0.22, false))); +typedef struct GenerateRmats_Usecase_t { + size_t n_edgelists{0}; + size_t min_scale{0}; + size_t max_scale{0}; + size_t edge_factor{0}; + cugraph::generator_distribution_t component_distribution; + cugraph::generator_distribution_t edge_distribution; + + GenerateRmats_Usecase_t(size_t n_edgelists, + size_t min_scale, + size_t max_scale, + size_t edge_factor, + cugraph::generator_distribution_t component_distribution, + cugraph::generator_distribution_t edge_distribution) + : n_edgelists(n_edgelists), + min_scale(min_scale), + max_scale(max_scale), + component_distribution(component_distribution), + edge_distribution(edge_distribution), + edge_factor(edge_factor){}; +} GenerateRmats_Usecase; +class Tests_GenerateRmats : public ::testing::TestWithParam { + public: + Tests_GenerateRmats() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(GenerateRmats_Usecase const& configuration) + { + raft::handle_t handle{}; + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + auto outputs = cugraph::generate_rmat_edgelists(handle, + configuration.n_edgelists, + configuration.min_scale, + configuration.max_scale, + configuration.edge_factor, + configuration.component_distribution, + configuration.edge_distribution, + uint64_t{0}); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + ASSERT_EQ(configuration.n_edgelists, outputs.size()); + for (auto i = outputs.begin(); i != outputs.end(); ++i) { + ASSERT_EQ(std::get<0>(*i).size(), std::get<1>(*i).size()); + ASSERT_TRUE((configuration.min_scale * configuration.edge_factor) <= std::get<0>(*i).size()); + ASSERT_TRUE((configuration.max_scale * configuration.edge_factor) >= std::get<0>(*i).size()); + } + } +}; +TEST_P(Tests_GenerateRmats, CheckInt32) { run_current_test(GetParam()); } + +INSTANTIATE_TEST_SUITE_P( + simple_test, + Tests_GenerateRmats, + ::testing::Values(GenerateRmats_Usecase(8, + 1, + 16, + 32, + cugraph::generator_distribution_t::UNIFORM, + cugraph::generator_distribution_t::UNIFORM), + GenerateRmats_Usecase(8, + 1, + 16, + 32, + cugraph::generator_distribution_t::UNIFORM, + cugraph::generator_distribution_t::POWER_LAW), + GenerateRmats_Usecase(8, + 3, + 16, + 32, + cugraph::generator_distribution_t::POWER_LAW, + cugraph::generator_distribution_t::UNIFORM), + GenerateRmats_Usecase(8, + 3, + 16, + 32, + cugraph::generator_distribution_t::POWER_LAW, + cugraph::generator_distribution_t::POWER_LAW))); +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/graph_test.cpp 
b/cpp/tests/experimental/graph_test.cpp index b80de68f95c..dff841a5b73 100644 --- a/cpp/tests/experimental/graph_test.cpp +++ b/cpp/tests/experimental/graph_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,8 +17,8 @@ #include #include -#include -#include +#include +#include #include #include @@ -34,16 +34,18 @@ #include template -std::tuple, std::vector, std::vector> graph_reference( - vertex_t const* p_src_vertices, - vertex_t const* p_dst_vertices, - weight_t const* p_edge_weights, - vertex_t number_of_vertices, - edge_t number_of_edges) +std::tuple, std::vector, std::optional>> +graph_reference(vertex_t const* p_src_vertices, + vertex_t const* p_dst_vertices, + std::optional p_edge_weights, + vertex_t number_of_vertices, + edge_t number_of_edges) { std::vector offsets(number_of_vertices + 1, edge_t{0}); std::vector indices(number_of_edges, vertex_t{0}); - std::vector weights(p_edge_weights != nullptr ? number_of_edges : 0, weight_t{0.0}); + auto weights = p_edge_weights + ? std::make_optional>(number_of_edges, weight_t{0.0}) + : std::nullopt; for (size_t i = 0; i < number_of_edges; ++i) { auto major = store_transposed ? p_dst_vertices[i] : p_src_vertices[i]; @@ -58,7 +60,7 @@ std::tuple, std::vector, std::vector> gr auto degree = offsets[major + 1] - start; auto idx = indices[start + degree - 1]++; indices[start + idx] = minor; - if (p_edge_weights != nullptr) { weights[start + idx] = p_edge_weights[i]; } + if (p_edge_weights) { (*weights)[start + idx] = (*p_edge_weights)[i]; } } return std::make_tuple(std::move(offsets), std::move(indices), std::move(weights)); @@ -91,43 +93,38 @@ class Tests_Graph : public ::testing::TestWithParam { template void run_current_test(Graph_Usecase const& configuration) { - auto mm_graph = - cugraph::test::read_edgelist_from_matrix_market_file( - configuration.graph_file_full_path); - edge_t number_of_edges = static_cast(mm_graph.h_rows.size()); + raft::handle_t handle{}; - std::vector h_reference_offsets{}; - std::vector h_reference_indices{}; - std::vector h_reference_weights{}; + auto [d_rows, d_cols, d_weights, number_of_vertices, is_symmetric] = + cugraph::test::read_edgelist_from_matrix_market_file( + handle, configuration.graph_file_full_path, configuration.test_weighted); + edge_t number_of_edges = static_cast(d_rows.size()); + + std::vector h_rows(number_of_edges); + std::vector h_cols(number_of_edges); + auto h_weights = + d_weights ? std::make_optional>(number_of_edges) : std::nullopt; + + raft::update_host(h_rows.data(), d_rows.data(), number_of_edges, handle.get_stream()); + raft::update_host(h_cols.data(), d_cols.data(), number_of_edges, handle.get_stream()); + if (h_weights) { + raft::update_host( + (*h_weights).data(), (*d_weights).data(), number_of_edges, handle.get_stream()); + } + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - std::tie(h_reference_offsets, h_reference_indices, h_reference_weights) = + auto [h_reference_offsets, h_reference_indices, h_reference_weights] = graph_reference( - mm_graph.h_rows.data(), - mm_graph.h_cols.data(), - configuration.test_weighted ? mm_graph.h_weights.data() : nullptr, - mm_graph.number_of_vertices, + h_rows.data(), + h_cols.data(), + h_weights ? 
std::optional{(*h_weights).data()} : std::nullopt, + number_of_vertices, number_of_edges); - raft::handle_t handle{}; - - rmm::device_uvector d_rows(number_of_edges, handle.get_stream()); - rmm::device_uvector d_cols(number_of_edges, handle.get_stream()); - rmm::device_uvector d_weights(configuration.test_weighted ? number_of_edges : 0, - handle.get_stream()); - - raft::update_device( - d_rows.data(), mm_graph.h_rows.data(), number_of_edges, handle.get_stream()); - raft::update_device( - d_cols.data(), mm_graph.h_cols.data(), number_of_edges, handle.get_stream()); - if (configuration.test_weighted) { - raft::update_device( - d_weights.data(), mm_graph.h_weights.data(), number_of_edges, handle.get_stream()); - } - cugraph::experimental::edgelist_t edgelist{ d_rows.data(), d_cols.data(), - configuration.test_weighted ? d_weights.data() : nullptr, + d_weights ? std::optional{(*d_weights).data()} : std::nullopt, number_of_edges}; CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement @@ -136,34 +133,35 @@ class Tests_Graph : public ::testing::TestWithParam { cugraph::experimental::graph_t( handle, edgelist, - mm_graph.number_of_vertices, - cugraph::experimental::graph_properties_t{mm_graph.is_symmetric, false}, - false, + number_of_vertices, + cugraph::experimental::graph_properties_t{is_symmetric, false}, + std::nullopt, true); auto graph_view = graph.view(); CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - ASSERT_EQ(graph_view.get_number_of_vertices(), mm_graph.number_of_vertices); + ASSERT_EQ(graph_view.get_number_of_vertices(), number_of_vertices); ASSERT_EQ(graph_view.get_number_of_edges(), number_of_edges); std::vector h_cugraph_offsets(graph_view.get_number_of_vertices() + 1); std::vector h_cugraph_indices(graph_view.get_number_of_edges()); - std::vector h_cugraph_weights( - configuration.test_weighted ? graph_view.get_number_of_edges() : 0); + auto h_cugraph_weights = + graph.is_weighted() ? 
std::optional>(graph_view.get_number_of_edges()) + : std::nullopt; raft::update_host(h_cugraph_offsets.data(), - graph_view.offsets(), + graph_view.get_matrix_partition_view().get_offsets(), graph_view.get_number_of_vertices() + 1, handle.get_stream()); raft::update_host(h_cugraph_indices.data(), - graph_view.indices(), + graph_view.get_matrix_partition_view().get_indices(), graph_view.get_number_of_edges(), handle.get_stream()); - if (configuration.test_weighted) { - raft::update_host(h_cugraph_weights.data(), - graph_view.weights(), + if (h_cugraph_weights) { + raft::update_host((*h_cugraph_weights).data(), + *(graph_view.get_matrix_partition_view().get_weights()), graph_view.get_number_of_edges(), handle.get_stream()); } @@ -173,8 +171,11 @@ class Tests_Graph : public ::testing::TestWithParam { ASSERT_TRUE( std::equal(h_reference_offsets.begin(), h_reference_offsets.end(), h_cugraph_offsets.begin())) << "Graph compressed sparse format offsets do not match with the reference values."; - ASSERT_EQ(h_reference_weights.size(), h_cugraph_weights.size()); - for (vertex_t i = 0; i < mm_graph.number_of_vertices; ++i) { + ASSERT_EQ(h_reference_weights.has_value(), h_cugraph_weights.has_value()); + if (h_reference_weights) { + ASSERT_EQ((*h_reference_weights).size(), (*h_cugraph_weights).size()); + } + for (vertex_t i = 0; i < number_of_vertices; ++i) { auto start = h_reference_offsets[i]; auto degree = h_reference_offsets[i + 1] - start; if (configuration.test_weighted) { @@ -182,9 +183,9 @@ class Tests_Graph : public ::testing::TestWithParam { std::vector> cugraph_pairs(degree); for (edge_t j = 0; j < degree; ++j) { reference_pairs[j] = - std::make_tuple(h_reference_indices[start + j], h_reference_weights[start + j]); + std::make_tuple(h_reference_indices[start + j], (*h_reference_weights)[start + j]); cugraph_pairs[j] = - std::make_tuple(h_cugraph_indices[start + j], h_cugraph_weights[start + j]); + std::make_tuple(h_cugraph_indices[start + j], (*h_cugraph_weights)[start + j]); } std::sort(reference_pairs.begin(), reference_pairs.end()); std::sort(cugraph_pairs.begin(), cugraph_pairs.end()); @@ -228,15 +229,15 @@ TEST_P(Tests_Graph, CheckStoreTransposedTrue) run_current_test(GetParam()); } -INSTANTIATE_TEST_CASE_P(simple_test, - Tests_Graph, - ::testing::Values(Graph_Usecase("test/datasets/karate.mtx", false), - Graph_Usecase("test/datasets/karate.mtx", true), - Graph_Usecase("test/datasets/web-Google.mtx", false), - Graph_Usecase("test/datasets/web-Google.mtx", true), - Graph_Usecase("test/datasets/ljournal-2008.mtx", false), - Graph_Usecase("test/datasets/ljournal-2008.mtx", true), - Graph_Usecase("test/datasets/webbase-1M.mtx", false), - Graph_Usecase("test/datasets/webbase-1M.mtx", true))); +INSTANTIATE_TEST_SUITE_P(simple_test, + Tests_Graph, + ::testing::Values(Graph_Usecase("test/datasets/karate.mtx", false), + Graph_Usecase("test/datasets/karate.mtx", true), + Graph_Usecase("test/datasets/web-Google.mtx", false), + Graph_Usecase("test/datasets/web-Google.mtx", true), + Graph_Usecase("test/datasets/ljournal-2008.mtx", false), + Graph_Usecase("test/datasets/ljournal-2008.mtx", true), + Graph_Usecase("test/datasets/webbase-1M.mtx", false), + Graph_Usecase("test/datasets/webbase-1M.mtx", true))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/induced_subgraph_test.cpp b/cpp/tests/experimental/induced_subgraph_test.cpp new file mode 100644 index 00000000000..3e2dbf4fe3c --- /dev/null +++ b/cpp/tests/experimental/induced_subgraph_test.cpp @@ -0,0 +1,326 @@ +/* + * Copyright 
(c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::tuple<std::vector<vertex_t>,
+           std::vector<vertex_t>,
+           std::optional<std::vector<weight_t>>,
+           std::vector<size_t>>
+extract_induced_subgraph_reference(edge_t const* offsets,
+                                   vertex_t const* indices,
+                                   std::optional<weight_t const*> weights,
+                                   size_t const* subgraph_offsets,
+                                   vertex_t const* subgraph_vertices,
+                                   vertex_t num_vertices,
+                                   size_t num_subgraphs)
+{
+  std::vector<vertex_t> edgelist_majors{};
+  std::vector<vertex_t> edgelist_minors{};
+  auto edgelist_weights = weights ? std::make_optional<std::vector<weight_t>>(0) : std::nullopt;
+  std::vector<size_t> subgraph_edge_offsets{0};
+
+  for (size_t i = 0; i < num_subgraphs; ++i) {
+    std::for_each(subgraph_vertices + subgraph_offsets[i],
+                  subgraph_vertices + subgraph_offsets[i + 1],
+                  [offsets,
+                   indices,
+                   weights,
+                   subgraph_vertices,
+                   subgraph_offsets,
+                   &edgelist_majors,
+                   &edgelist_minors,
+                   &edgelist_weights,
+                   i](auto v) {
+                    auto first = offsets[v];
+                    auto last  = offsets[v + 1];
+                    for (auto j = first; j < last; ++j) {
+                      if (std::binary_search(subgraph_vertices + subgraph_offsets[i],
+                                             subgraph_vertices + subgraph_offsets[i + 1],
+                                             indices[j])) {
+                        edgelist_majors.push_back(v);
+                        edgelist_minors.push_back(indices[j]);
+                        if (weights) { (*edgelist_weights).push_back((*weights)[j]); }
+                      }
+                    }
+                  });
+    subgraph_edge_offsets.push_back(edgelist_majors.size());
+  }
+
+  return std::make_tuple(edgelist_majors, edgelist_minors, edgelist_weights, subgraph_edge_offsets);
+}
+
+typedef struct InducedSubgraph_Usecase_t {
+  std::string graph_file_full_path{};
+  std::vector<size_t> subgraph_sizes{};
+  bool test_weighted{false};
+
+  InducedSubgraph_Usecase_t(std::string const& graph_file_path,
+                            std::vector<size_t> const& subgraph_sizes,
+                            bool test_weighted)
+    : subgraph_sizes(subgraph_sizes), test_weighted(test_weighted)
+  {
+    if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
+      graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path;
+    } else {
+      graph_file_full_path = graph_file_path;
+    }
+  };
+} InducedSubgraph_Usecase;
+
+class Tests_InducedSubgraph : public ::testing::TestWithParam<InducedSubgraph_Usecase> {
+ public:
+  Tests_InducedSubgraph() {}
+  static void SetupTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t, bool store_transposed>
+  void run_current_test(InducedSubgraph_Usecase const& configuration)
+  {
+    raft::handle_t handle{};
+
+    cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, store_transposed, false> graph(
+      handle);
+    std::tie(graph, std::ignore) = cugraph::test::
+      read_graph_from_matrix_market_file<vertex_t, edge_t, weight_t, store_transposed, false>(
+        handle, configuration.graph_file_full_path, configuration.test_weighted, false);
+    auto graph_view = graph.view();
+
+    std::vector<edge_t> h_offsets(graph_view.get_number_of_vertices() + 1);
+    std::vector<vertex_t> h_indices(graph_view.get_number_of_edges());
+    auto h_weights = graph_view.is_weighted() ? std::make_optional<std::vector<weight_t>>(
+                                                  graph_view.get_number_of_edges(), weight_t{0.0})
+                                              : std::nullopt;
+    raft::update_host(h_offsets.data(),
+                      graph_view.get_matrix_partition_view().get_offsets(),
+                      graph_view.get_number_of_vertices() + 1,
+                      handle.get_stream());
+    raft::update_host(h_indices.data(),
+                      graph_view.get_matrix_partition_view().get_indices(),
+                      graph_view.get_number_of_edges(),
+                      handle.get_stream());
+    if (h_weights) {
+      raft::update_host((*h_weights).data(),
+                        *(graph_view.get_matrix_partition_view().get_weights()),
+                        graph_view.get_number_of_edges(),
+                        handle.get_stream());
+    }
+    CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
+
+    std::vector<size_t> h_subgraph_offsets(configuration.subgraph_sizes.size() + 1, 0);
+    std::partial_sum(configuration.subgraph_sizes.begin(),
+                     configuration.subgraph_sizes.end(),
+                     h_subgraph_offsets.begin() + 1);
+    std::vector<vertex_t> h_subgraph_vertices(
+      h_subgraph_offsets.back(), cugraph::experimental::invalid_vertex_id<vertex_t>::value);
+    std::default_random_engine generator{};
+    std::uniform_int_distribution<vertex_t> distribution{0,
+                                                         graph_view.get_number_of_vertices() - 1};
+
+    for (size_t i = 0; i < configuration.subgraph_sizes.size(); ++i) {
+      auto start = h_subgraph_offsets[i];
+      auto last  = h_subgraph_offsets[i + 1];
+      ASSERT_TRUE(last - start <= graph_view.get_number_of_vertices()) << "Invalid subgraph size.";
+      // this is inefficient if last - start << graph_view.get_number_of_vertices(), but this is
+      // for test purposes only and the time & memory cost is only linear in
+      // graph_view.get_number_of_vertices(), so this may not matter.
+      std::vector<vertex_t> vertices(graph_view.get_number_of_vertices());
+      std::iota(vertices.begin(), vertices.end(), vertex_t{0});
+      std::random_shuffle(vertices.begin(), vertices.end());
+      std::copy(
+        vertices.begin(), vertices.begin() + (last - start), h_subgraph_vertices.begin() + start);
+      std::sort(h_subgraph_vertices.begin() + start, h_subgraph_vertices.begin() + last);
+    }
+
+    rmm::device_uvector<size_t> d_subgraph_offsets(h_subgraph_offsets.size(), handle.get_stream());
+    rmm::device_uvector<vertex_t> d_subgraph_vertices(h_subgraph_vertices.size(),
+                                                      handle.get_stream());
+    raft::update_device(d_subgraph_offsets.data(),
+                        h_subgraph_offsets.data(),
+                        h_subgraph_offsets.size(),
+                        handle.get_stream());
+    raft::update_device(d_subgraph_vertices.data(),
+                        h_subgraph_vertices.data(),
+                        h_subgraph_vertices.size(),
+                        handle.get_stream());
+
+    auto [h_reference_subgraph_edgelist_majors,
+          h_reference_subgraph_edgelist_minors,
+          h_reference_subgraph_edgelist_weights,
+          h_reference_subgraph_edge_offsets] =
+      extract_induced_subgraph_reference(
+        h_offsets.data(),
+        h_indices.data(),
+        h_weights ? std::optional{(*h_weights).data()} : std::nullopt,
+        h_subgraph_offsets.data(),
+        h_subgraph_vertices.data(),
+        graph_view.get_number_of_vertices(),
+        configuration.subgraph_sizes.size());
+
+    CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+
+    // FIXME: turn off do_expensive_check once verified.
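// NOTE (not part of this patch): std::random_shuffle, used above to sample subgraph
// vertices, was deprecated in C++14 and removed in C++17. A drop-in replacement is
// std::shuffle(vertices.begin(), vertices.end(), generator), reusing the
// std::default_random_engine declared above.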
+ auto [d_subgraph_edgelist_majors, + d_subgraph_edgelist_minors, + d_subgraph_edgelist_weights, + d_subgraph_edge_offsets] = + cugraph::experimental::extract_induced_subgraphs(handle, + graph_view, + d_subgraph_offsets.data(), + d_subgraph_vertices.data(), + configuration.subgraph_sizes.size(), + true); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + std::vector h_cugraph_subgraph_edgelist_majors(d_subgraph_edgelist_majors.size()); + std::vector h_cugraph_subgraph_edgelist_minors(d_subgraph_edgelist_minors.size()); + auto h_cugraph_subgraph_edgelist_weights = + d_subgraph_edgelist_weights + ? std::make_optional>((*d_subgraph_edgelist_weights).size()) + : std::nullopt; + std::vector h_cugraph_subgraph_edge_offsets(d_subgraph_edge_offsets.size()); + + raft::update_host(h_cugraph_subgraph_edgelist_majors.data(), + d_subgraph_edgelist_majors.data(), + d_subgraph_edgelist_majors.size(), + handle.get_stream()); + raft::update_host(h_cugraph_subgraph_edgelist_minors.data(), + d_subgraph_edgelist_minors.data(), + d_subgraph_edgelist_minors.size(), + handle.get_stream()); + if (d_subgraph_edgelist_weights) { + raft::update_host((*h_cugraph_subgraph_edgelist_weights).data(), + (*d_subgraph_edgelist_weights).data(), + (*d_subgraph_edgelist_weights).size(), + handle.get_stream()); + } + raft::update_host(h_cugraph_subgraph_edge_offsets.data(), + d_subgraph_edge_offsets.data(), + d_subgraph_edge_offsets.size(), + handle.get_stream()); + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + + ASSERT_TRUE(h_reference_subgraph_edge_offsets.size() == h_cugraph_subgraph_edge_offsets.size()) + << "Returned subgraph edge offset vector has an invalid size."; + ASSERT_TRUE(std::equal(h_reference_subgraph_edge_offsets.begin(), + h_reference_subgraph_edge_offsets.end(), + h_cugraph_subgraph_edge_offsets.begin())) + << "Returned subgraph edge offset values do not match with the reference values."; + ASSERT_TRUE(h_reference_subgraph_edgelist_weights.has_value() == configuration.test_weighted); + ASSERT_TRUE(h_cugraph_subgraph_edgelist_weights.has_value() == + h_reference_subgraph_edgelist_weights.has_value()); + + for (size_t i = 0; i < configuration.subgraph_sizes.size(); ++i) { + auto start = h_reference_subgraph_edge_offsets[i]; + auto last = h_reference_subgraph_edge_offsets[i + 1]; + if (configuration.test_weighted) { + std::vector> reference_tuples(last - start); + std::vector> cugraph_tuples(last - start); + for (auto j = start; j < last; ++j) { + reference_tuples[j - start] = + std::make_tuple(h_reference_subgraph_edgelist_majors[j], + h_reference_subgraph_edgelist_minors[j], + (*h_reference_subgraph_edgelist_weights)[j]); + cugraph_tuples[j - start] = std::make_tuple(h_cugraph_subgraph_edgelist_majors[j], + h_cugraph_subgraph_edgelist_minors[j], + (*h_cugraph_subgraph_edgelist_weights)[j]); + } + ASSERT_TRUE( + std::equal(reference_tuples.begin(), reference_tuples.end(), cugraph_tuples.begin())) + << "Extracted subgraph edges do not match with the edges extracted by the reference " + "implementation."; + } else { + std::vector> reference_tuples(last - start); + std::vector> cugraph_tuples(last - start); + for (auto j = start; j < last; ++j) { + reference_tuples[j - start] = std::make_tuple(h_reference_subgraph_edgelist_majors[j], + h_reference_subgraph_edgelist_minors[j]); + cugraph_tuples[j - start] = std::make_tuple(h_cugraph_subgraph_edgelist_majors[j], + h_cugraph_subgraph_edgelist_minors[j]); + } + ASSERT_TRUE( + std::equal(reference_tuples.begin(), 
reference_tuples.end(), cugraph_tuples.begin())) + << "Extracted subgraph edges do not match with the edges extracted by the reference " + "implementation."; + } + } + } +}; + +// FIXME: add tests for type combinations + +TEST_P(Tests_InducedSubgraph, CheckInt32Int32FloatTransposed) +{ + run_current_test(GetParam()); +} + +TEST_P(Tests_InducedSubgraph, CheckInt32Int32FloatUntransposed) +{ + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_SUITE_P( + simple_test, + Tests_InducedSubgraph, + ::testing::Values( + InducedSubgraph_Usecase("test/datasets/karate.mtx", std::vector{0}, false), + InducedSubgraph_Usecase("test/datasets/karate.mtx", std::vector{1}, false), + InducedSubgraph_Usecase("test/datasets/karate.mtx", std::vector{10}, false), + InducedSubgraph_Usecase("test/datasets/karate.mtx", std::vector{34}, false), + InducedSubgraph_Usecase("test/datasets/karate.mtx", std::vector{10, 0, 5}, false), + InducedSubgraph_Usecase("test/datasets/karate.mtx", std::vector{9, 3, 10}, false), + InducedSubgraph_Usecase("test/datasets/karate.mtx", std::vector{5, 12, 13}, true), + InducedSubgraph_Usecase("test/datasets/web-Google.mtx", + std::vector{250, 130, 15}, + false), + InducedSubgraph_Usecase("test/datasets/web-Google.mtx", + std::vector{125, 300, 70}, + true), + InducedSubgraph_Usecase("test/datasets/ljournal-2008.mtx", + std::vector{300, 20, 400}, + false), + InducedSubgraph_Usecase("test/datasets/ljournal-2008.mtx", + std::vector{9130, 1200, 300}, + true), + InducedSubgraph_Usecase("test/datasets/webbase-1M.mtx", std::vector{700}, false), + InducedSubgraph_Usecase("test/datasets/webbase-1M.mtx", std::vector{500}, true))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/katz_centrality_test.cpp b/cpp/tests/experimental/katz_centrality_test.cpp index c2ac4340319..0a4fba9acd1 100644 --- a/cpp/tests/experimental/katz_centrality_test.cpp +++ b/cpp/tests/experimental/katz_centrality_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,12 +14,16 @@ * limitations under the License. */ +#include #include +#include #include +#include -#include -#include -#include +#include +#include +#include +#include #include #include @@ -34,11 +38,16 @@ #include #include +// do the perf measurements +// enabled by command line parameter s'--perf' +// +static int PERF = 0; + template -void katz_centrality_reference(edge_t* offsets, - vertex_t* indices, - weight_t* weights, - result_t* betas, +void katz_centrality_reference(edge_t const* offsets, + vertex_t const* indices, + std::optional weights, + result_t const* betas, result_t* katz_centralities, vertex_t num_vertices, result_t alpha, @@ -62,7 +71,7 @@ void katz_centrality_reference(edge_t* offsets, katz_centralities[i] = betas != nullptr ? betas[i] : beta; for (auto j = *(offsets + i); j < *(offsets + i + 1); ++j) { auto nbr = indices[j]; - auto w = weights != nullptr ? weights[j] : result_t{1.0}; + auto w = weights ? 
(*weights)[j] : result_t{1.0}; katz_centralities[i] += alpha * old_katz_centralities[nbr] * w; } } @@ -88,22 +97,14 @@ void katz_centrality_reference(edge_t* offsets, return; } -typedef struct KatzCentrality_Usecase_t { - std::string graph_file_full_path{}; +struct KatzCentrality_Usecase { bool test_weighted{false}; + bool check_correctness{true}; +}; - KatzCentrality_Usecase_t(std::string const& graph_file_path, bool test_weighted) - : test_weighted(test_weighted) - { - if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { - graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; - } else { - graph_file_full_path = graph_file_path; - } - }; -} KatzCentrality_Usecase; - -class Tests_KatzCentrality : public ::testing::TestWithParam { +template +class Tests_KatzCentrality + : public ::testing::TestWithParam> { public: Tests_KatzCentrality() {} static void SetupTestCase() {} @@ -113,118 +114,199 @@ class Tests_KatzCentrality : public ::testing::TestWithParam - void run_current_test(KatzCentrality_Usecase const& configuration) + void run_current_test(KatzCentrality_Usecase const& katz_usecase, + input_usecase_t const& input_usecase) { + constexpr bool renumber = true; + raft::handle_t handle{}; + HighResClock hr_clock{}; - auto graph = - cugraph::test::read_graph_from_matrix_market_file( - handle, configuration.graph_file_full_path, configuration.test_weighted); - auto graph_view = graph.view(); + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - std::vector h_weights{}; - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - if (graph_view.is_weighted()) { - h_weights.assign(graph_view.get_number_of_edges(), weight_t{0.0}); - raft::update_host(h_weights.data(), - graph_view.weights(), - graph_view.get_number_of_edges(), - handle.get_stream()); + auto [graph, d_renumber_map_labels] = + input_usecase.template construct_graph( + handle, katz_usecase.test_weighted, renumber); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "construct_graph took " << elapsed_time * 1e-6 << " s.\n"; } - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - std::vector h_reference_katz_centralities(graph_view.get_number_of_vertices()); + auto graph_view = graph.view(); - std::vector tmps(h_offsets.size()); - std::adjacent_difference(h_offsets.begin(), h_offsets.end(), tmps.begin()); - auto max_it = std::max_element(tmps.begin(), tmps.end()); + auto degrees = graph_view.compute_in_degrees(handle); + std::vector h_degrees(degrees.size()); + raft::update_host(h_degrees.data(), degrees.data(), degrees.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + auto max_it = std::max_element(h_degrees.begin(), h_degrees.end()); result_t const alpha = result_t{1.0} / static_cast(*max_it + 1); result_t constexpr beta{1.0}; result_t constexpr epsilon{1e-6}; - katz_centrality_reference( - h_offsets.data(), - h_indices.data(), - h_weights.size() > 0 ? 
h_weights.data() : static_cast(nullptr), - static_cast(nullptr), - h_reference_katz_centralities.data(), - graph_view.get_number_of_vertices(), - alpha, - beta, - epsilon, - std::numeric_limits::max(), - false, - true); - rmm::device_uvector d_katz_centralities(graph_view.get_number_of_vertices(), handle.get_stream()); - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } cugraph::experimental::katz_centrality(handle, graph_view, static_cast(nullptr), - d_katz_centralities.begin(), + d_katz_centralities.data(), alpha, beta, epsilon, std::numeric_limits::max(), false, - true, - false); - - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - - std::vector h_cugraph_katz_centralities(graph_view.get_number_of_vertices()); - - raft::update_host(h_cugraph_katz_centralities.data(), - d_katz_centralities.data(), - d_katz_centralities.size(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - auto threshold_ratio = 1e-3; - auto threshold_magnitude = - (epsilon / static_cast(graph_view.get_number_of_vertices())) * threshold_ratio; - auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { - auto diff = std::abs(lhs - rhs); - return (diff < std::max(lhs, rhs) * threshold_ratio) || (diff < threshold_magnitude); - }; - - ASSERT_TRUE(std::equal(h_reference_katz_centralities.begin(), - h_reference_katz_centralities.end(), - h_cugraph_katz_centralities.begin(), - nearly_equal)) - << "Katz centrality values do not match with the reference values."; + true); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "Katz Centrality took " << elapsed_time * 1e-6 << " s.\n"; + } + + if (katz_usecase.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore) = + input_usecase.template construct_graph( + handle, katz_usecase.test_weighted, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + auto h_weights = unrenumbered_graph_view.is_weighted() + ? std::make_optional>( + unrenumbered_graph_view.get_number_of_edges(), weight_t{0.0}) + : std::nullopt; + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.get_matrix_partition_view().get_offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.get_matrix_partition_view().get_indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + if (h_weights) { + raft::update_host((*h_weights).data(), + *(unrenumbered_graph_view.get_matrix_partition_view().get_weights()), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + } + + handle.get_stream_view().synchronize(); + + std::vector h_reference_katz_centralities( + unrenumbered_graph_view.get_number_of_vertices()); + + katz_centrality_reference( + h_offsets.data(), + h_indices.data(), + h_weights ? 
std::optional{(*h_weights).data()} : std::nullopt, + static_cast(nullptr), + h_reference_katz_centralities.data(), + unrenumbered_graph_view.get_number_of_vertices(), + alpha, + beta, + epsilon, + std::numeric_limits::max(), + false, + true); + + std::vector h_cugraph_katz_centralities(graph_view.get_number_of_vertices()); + if (renumber) { + rmm::device_uvector d_unrenumbered_katz_centralities(size_t{0}, + handle.get_stream()); + std::tie(std::ignore, d_unrenumbered_katz_centralities) = + cugraph::test::sort_by_key(handle, + (*d_renumber_map_labels).data(), + d_katz_centralities.data(), + (*d_renumber_map_labels).size()); + raft::update_host(h_cugraph_katz_centralities.data(), + d_unrenumbered_katz_centralities.data(), + d_unrenumbered_katz_centralities.size(), + handle.get_stream()); + } else { + raft::update_host(h_cugraph_katz_centralities.data(), + d_katz_centralities.data(), + d_katz_centralities.size(), + handle.get_stream()); + } + + handle.get_stream_view().synchronize(); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for low Katz Centrality verties (lowly ranked vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; + + ASSERT_TRUE(std::equal(h_reference_katz_centralities.begin(), + h_reference_katz_centralities.end(), + h_cugraph_katz_centralities.begin(), + nearly_equal)) + << "Katz centrality values do not match with the reference values."; + } } }; +using Tests_KatzCentrality_File = Tests_KatzCentrality; +using Tests_KatzCentrality_Rmat = Tests_KatzCentrality; + // FIXME: add tests for type combinations -TEST_P(Tests_KatzCentrality, CheckInt32Int32FloatFloat) +TEST_P(Tests_KatzCentrality_File, CheckInt32Int32FloatFloat) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_KatzCentrality_Rmat, CheckInt32Int32FloatFloat) { - run_current_test(GetParam()); + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); } -INSTANTIATE_TEST_CASE_P( - simple_test, - Tests_KatzCentrality, - ::testing::Values(KatzCentrality_Usecase("test/datasets/karate.mtx", false), - KatzCentrality_Usecase("test/datasets/karate.mtx", true), - KatzCentrality_Usecase("test/datasets/web-Google.mtx", false), - KatzCentrality_Usecase("test/datasets/web-Google.mtx", true), - KatzCentrality_Usecase("test/datasets/ljournal-2008.mtx", false), - KatzCentrality_Usecase("test/datasets/ljournal-2008.mtx", true), - KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", false), - KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", true))); +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_KatzCentrality_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(KatzCentrality_Usecase{false}, KatzCentrality_Usecase{true}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); + +INSTANTIATE_TEST_SUITE_P(rmat_small_test, + Tests_KatzCentrality_Rmat, + // enable correctness checks + ::testing::Combine(::testing::Values(KatzCentrality_Usecase{false}, + KatzCentrality_Usecase{true}), + ::testing::Values(cugraph::test::Rmat_Usecase( + 10, 16, 0.57, 0.19, 
0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P(rmat_large_test, + Tests_KatzCentrality_Rmat, + // disable correctness checks for large graphs + ::testing::Combine(::testing::Values(KatzCentrality_Usecase{false, false}, + KatzCentrality_Usecase{true, false}), + ::testing::Values(cugraph::test::Rmat_Usecase( + 20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/louvain_test.cu b/cpp/tests/experimental/louvain_test.cu deleted file mode 100644 index ce8fb55b1d8..00000000000 --- a/cpp/tests/experimental/louvain_test.cu +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governin_from_mtxg permissions and - * limitations under the License. - */ - -#include -#include - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 -#include -#else -#include -#endif - -#include - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include - -typedef struct Louvain_Usecase_t { - std::string graph_file_full_path{}; - bool test_weighted{false}; - - Louvain_Usecase_t(std::string const& graph_file_path, bool test_weighted) - : test_weighted(test_weighted) - { - if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { - graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; - } else { - graph_file_full_path = graph_file_path; - } - }; -} Louvain_Usecase; - -class Tests_Louvain : public ::testing::TestWithParam { - public: - Tests_Louvain() {} - static void SetupTestCase() {} - static void TearDownTestCase() {} - - virtual void SetUp() {} - virtual void TearDown() {} - - template - void run_current_test(Louvain_Usecase const& configuration) - { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 - CUGRAPH_FAIL("Louvain not supported on Pascal and older architectures"); -#else - raft::handle_t handle{}; - - std::cout << "read graph file: " << configuration.graph_file_full_path << std::endl; - - auto graph = - cugraph::test::read_graph_from_matrix_market_file( - handle, configuration.graph_file_full_path, configuration.test_weighted); - - auto graph_view = graph.view(); - - louvain(graph_view); -#endif - } - - template - void louvain(graph_t const& graph_view) - { - using vertex_t = typename graph_t::vertex_type; - using weight_t = typename graph_t::weight_type; - - raft::handle_t handle{}; - - rmm::device_vector clustering_v(graph_view.get_number_of_local_vertices()); - size_t level; - weight_t modularity; - - std::tie(level, modularity) = - cugraph::louvain(handle, graph_view, clustering_v.data().get(), size_t{100}, weight_t{1}); - - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - - std::cout << "level = " << level << std::endl; - std::cout << "modularity = " << modularity << std::endl; - } -}; - -// FIXME: add tests for type combinations -TEST_P(Tests_Louvain, CheckInt32Int32FloatFloat) -{ - run_current_test(GetParam()); -} - -INSTANTIATE_TEST_CASE_P(simple_test, - Tests_Louvain, - 
::testing::Values(Louvain_Usecase("test/datasets/karate.mtx", true)
-#if 0
-                                          ,
-                                          Louvain_Usecase("test/datasets/web-Google.mtx", true),
-                                          Louvain_Usecase("test/datasets/ljournal-2008.mtx", true),
-                                          Louvain_Usecase("test/datasets/webbase-1M.mtx", true)
-#endif
-                                          ));
-
-CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/experimental/mg_bfs_test.cpp b/cpp/tests/experimental/mg_bfs_test.cpp
new file mode 100644
index 00000000000..5257c728ce4
--- /dev/null
+++ b/cpp/tests/experimental/mg_bfs_test.cpp
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+// do the perf measurements
+// enabled by command line parameter '--perf'
+//
+static int PERF = 0;
+
+struct BFS_Usecase {
+  size_t source{0};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGBFS : public ::testing::TestWithParam<std::tuple<BFS_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGBFS() {}
+  static void SetupTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Compare the results of running BFS on multiple GPUs to that of a single-GPU run
+  template <typename vertex_t, typename edge_t>
+  void run_current_test(BFS_Usecase const& bfs_usecase, input_usecase_t const& input_usecase)
+  {
+    using weight_t = float;
+
+    // 1. initialize handle
+
+    raft::handle_t handle{};
+    HighResClock hr_clock{};
+
+    raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
+    auto& comm = handle.get_comms();
+    auto const comm_size = comm.get_size();
+    auto const comm_rank = comm.get_rank();
+
+    auto row_comm_size = static_cast<size_t>(sqrt(static_cast<double>(comm_size)));
+    while (comm_size % row_comm_size != 0) {
+      --row_comm_size;
+    }
+    cugraph::partition_2d::subcomm_factory_t<cugraph::partition_2d::key_naming_t, vertex_t>
+      subcomm_factory(handle, row_comm_size);
+
+    // 2. create MG graph
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      hr_clock.start();
+    }
+
+    auto [mg_graph, d_mg_renumber_map_labels] =
+      input_usecase.template construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        handle, false, true);
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG construct_graph took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    ASSERT_TRUE(static_cast<vertex_t>(bfs_usecase.source) >= 0 &&
+                static_cast<vertex_t>(bfs_usecase.source) < mg_graph_view.get_number_of_vertices())
+      << "Invalid starting source.";
+
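// NOTE (illustration, not part of this patch): the subcomm_factory setup above carves the
// communicator into a 2D grid. row_comm_size starts at floor(sqrt(comm_size)) and decrements
// until it divides comm_size, i.e. it is the largest divisor of comm_size not exceeding
// sqrt(comm_size), which yields the most nearly square grid:
//   comm_size = 4  -> 2 x 2
//   comm_size = 8  -> 2 x 4
//   comm_size = 12 -> 3 x 4
//   comm_size = 16 -> 4 x 4
+    // 3.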
run MG BFS + + rmm::device_uvector d_mg_distances(mg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + rmm::device_uvector d_mg_predecessors(mg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle.get_comms().barrier(); + hr_clock.start(); + } + + cugraph::experimental::bfs(handle, + mg_graph_view, + d_mg_distances.data(), + d_mg_predecessors.data(), + static_cast(bfs_usecase.source), + false, + std::numeric_limits::max()); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle.get_comms().barrier(); + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "MG BFS took " << elapsed_time * 1e-6 << " s.\n"; + } + + // 4. compare SG & MG results + + if (bfs_usecase.check_correctness) { + // 4-1. aggregate MG results + + auto d_mg_aggregate_renumber_map_labels = cugraph::test::device_gatherv( + handle, (*d_mg_renumber_map_labels).data(), (*d_mg_renumber_map_labels).size()); + auto d_mg_aggregate_distances = + cugraph::test::device_gatherv(handle, d_mg_distances.data(), d_mg_distances.size()); + auto d_mg_aggregate_predecessors = + cugraph::test::device_gatherv(handle, d_mg_predecessors.data(), d_mg_predecessors.size()); + + if (handle.get_comms().get_rank() == int{0}) { + // 4-2. unrenumbr MG results + + cugraph::experimental::unrenumber_int_vertices( + handle, + d_mg_aggregate_predecessors.data(), + d_mg_aggregate_predecessors.size(), + d_mg_aggregate_renumber_map_labels.data(), + vertex_t{0}, + mg_graph_view.get_number_of_vertices(), + std::vector{mg_graph_view.get_number_of_vertices()}); + + std::tie(std::ignore, d_mg_aggregate_distances) = + cugraph::test::sort_by_key(handle, + d_mg_aggregate_renumber_map_labels.data(), + d_mg_aggregate_distances.data(), + d_mg_aggregate_renumber_map_labels.size()); + std::tie(std::ignore, d_mg_aggregate_predecessors) = + cugraph::test::sort_by_key(handle, + d_mg_aggregate_renumber_map_labels.data(), + d_mg_aggregate_predecessors.data(), + d_mg_aggregate_renumber_map_labels.size()); + + // 4-3. create SG graph + + cugraph::experimental::graph_t sg_graph(handle); + std::tie(sg_graph, std::ignore) = + input_usecase.template construct_graph( + handle, false, false); + + auto sg_graph_view = sg_graph.view(); + + ASSERT_TRUE(mg_graph_view.get_number_of_vertices() == + sg_graph_view.get_number_of_vertices()); + + // 4-4. run SG BFS + + rmm::device_uvector d_sg_distances(sg_graph_view.get_number_of_vertices(), + handle.get_stream()); + rmm::device_uvector d_sg_predecessors( + sg_graph_view.get_number_of_local_vertices(), handle.get_stream()); + + vertex_t unrenumbered_source{}; + raft::update_host(&unrenumbered_source, + d_mg_aggregate_renumber_map_labels.data() + bfs_usecase.source, + size_t{1}, + handle.get_stream()); + handle.get_stream_view().synchronize(); + + cugraph::experimental::bfs(handle, + sg_graph_view, + d_sg_distances.data(), + d_sg_predecessors.data(), + unrenumbered_source, + false, + std::numeric_limits::max()); + // 4-5. 
compare + + std::vector h_sg_offsets(sg_graph_view.get_number_of_vertices() + 1); + std::vector h_sg_indices(sg_graph_view.get_number_of_edges()); + raft::update_host(h_sg_offsets.data(), + sg_graph_view.get_matrix_partition_view().get_offsets(), + sg_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_sg_indices.data(), + sg_graph_view.get_matrix_partition_view().get_indices(), + sg_graph_view.get_number_of_edges(), + handle.get_stream()); + + std::vector h_mg_aggregate_distances(mg_graph_view.get_number_of_vertices()); + std::vector h_mg_aggregate_predecessors(mg_graph_view.get_number_of_vertices()); + + raft::update_host(h_mg_aggregate_distances.data(), + d_mg_aggregate_distances.data(), + d_mg_aggregate_distances.size(), + handle.get_stream()); + raft::update_host(h_mg_aggregate_predecessors.data(), + d_mg_aggregate_predecessors.data(), + d_mg_aggregate_predecessors.size(), + handle.get_stream()); + + std::vector h_sg_distances(sg_graph_view.get_number_of_vertices()); + std::vector h_sg_predecessors(sg_graph_view.get_number_of_vertices()); + + raft::update_host( + h_sg_distances.data(), d_sg_distances.data(), d_sg_distances.size(), handle.get_stream()); + raft::update_host(h_sg_predecessors.data(), + d_sg_predecessors.data(), + d_sg_predecessors.size(), + handle.get_stream()); + handle.get_stream_view().synchronize(); + + ASSERT_TRUE(std::equal(h_mg_aggregate_distances.begin(), + h_mg_aggregate_distances.end(), + h_sg_distances.begin())); + for (size_t i = 0; i < h_mg_aggregate_predecessors.size(); ++i) { + if (h_mg_aggregate_predecessors[i] == + cugraph::experimental::invalid_vertex_id::value) { + ASSERT_TRUE(h_sg_predecessors[i] == h_mg_aggregate_predecessors[i]) + << "vertex reachability does not match with the SG result."; + } else { + ASSERT_TRUE(h_sg_distances[h_mg_aggregate_predecessors[i]] + 1 == h_sg_distances[i]) + << "distances to this vertex != distances to the predecessor vertex + 1."; + bool found{false}; + for (auto j = h_sg_offsets[h_mg_aggregate_predecessors[i]]; + j < h_sg_offsets[h_mg_aggregate_predecessors[i] + 1]; + ++j) { + if (h_sg_indices[j] == i) { + found = true; + break; + } + } + ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex."; + } + } + } + } + } +}; + +using Tests_MGBFS_File = Tests_MGBFS; +using Tests_MGBFS_Rmat = Tests_MGBFS; + +TEST_P(Tests_MGBFS_File, CheckInt32Int32) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MGBFS_Rmat, CheckInt32Int32) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_MGBFS_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(BFS_Usecase{0}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); + +INSTANTIATE_TEST_SUITE_P(rmat_small_test, + Tests_MGBFS_Rmat, + ::testing::Values( + // enable correctness checks + std::make_tuple(BFS_Usecase{0}, + cugraph::test::Rmat_Usecase( + 10, 16, 0.57, 0.19, 0.19, 0, false, false, 0, true)))); + +INSTANTIATE_TEST_SUITE_P(rmat_large_test, + Tests_MGBFS_Rmat, + ::testing::Values( + // disable correctness checks for large graphs + std::make_tuple(BFS_Usecase{0, false}, + cugraph::test::Rmat_Usecase( + 20, 32, 0.57, 0.19, 0.19, 0, false, 
false, 0, true))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/experimental/mg_katz_centrality_test.cpp b/cpp/tests/experimental/mg_katz_centrality_test.cpp
new file mode 100644
index 00000000000..9071701324e
--- /dev/null
+++ b/cpp/tests/experimental/mg_katz_centrality_test.cpp
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+// do the perf measurements
+// enabled by command line parameter '--perf'
+//
+static int PERF = 0;
+
+struct KatzCentrality_Usecase {
+  bool test_weighted{false};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGKatzCentrality
+  : public ::testing::TestWithParam<std::tuple<KatzCentrality_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGKatzCentrality() {}
+  static void SetupTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Compare the results of running Katz Centrality on multiple GPUs to that of a single-GPU run
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_current_test(KatzCentrality_Usecase const& katz_usecase,
+                        input_usecase_t const& input_usecase)
+  {
+    // 1. initialize handle
+
+    raft::handle_t handle{};
+    HighResClock hr_clock{};
+
+    raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
+    auto& comm = handle.get_comms();
+    auto const comm_size = comm.get_size();
+    auto const comm_rank = comm.get_rank();
+
+    auto row_comm_size = static_cast<size_t>(sqrt(static_cast<double>(comm_size)));
+    while (comm_size % row_comm_size != 0) {
+      --row_comm_size;
+    }
+    cugraph::partition_2d::subcomm_factory_t<cugraph::partition_2d::key_naming_t, vertex_t>
+      subcomm_factory(handle, row_comm_size);
+
+    // 2. create MG graph
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      hr_clock.start();
+    }
+
+    auto [mg_graph, d_mg_renumber_map_labels] =
+      input_usecase.template construct_graph<vertex_t, edge_t, weight_t, true, true>(
+        handle, katz_usecase.test_weighted, true);
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG construct_graph took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    // 3. compute max in-degree
+
+    auto max_in_degree = mg_graph_view.compute_max_in_degree(handle);
+
+    // 4. run MG Katz Centrality
+
+    result_t const alpha = result_t{1.0} / static_cast<result_t>(max_in_degree + 1);
+    result_t constexpr beta{1.0};
+    result_t constexpr epsilon{1e-6};
+
+    rmm::device_uvector<result_t> d_mg_katz_centralities(
+      mg_graph_view.get_number_of_local_vertices(), handle.get_stream());
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      hr_clock.start();
+    }
+
+    cugraph::experimental::katz_centrality(handle,
+                                           mg_graph_view,
+                                           static_cast<result_t*>(nullptr),
+                                           d_mg_katz_centralities.data(),
+                                           alpha,
+                                           beta,
+                                           epsilon,
+                                           std::numeric_limits<size_t>::max(),
+                                           false);
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG Katz Centrality took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
+    // 5. compare SG & MG results
+
+    if (katz_usecase.check_correctness) {
+      // 5-1. aggregate MG results
+
+      auto d_mg_aggregate_renumber_map_labels = cugraph::test::device_gatherv(
+        handle, (*d_mg_renumber_map_labels).data(), (*d_mg_renumber_map_labels).size());
+      auto d_mg_aggregate_katz_centralities = cugraph::test::device_gatherv(
+        handle, d_mg_katz_centralities.data(), d_mg_katz_centralities.size());
+
+      if (handle.get_comms().get_rank() == int{0}) {
+        // 5-2. unrenumber MG results
+
+        std::tie(std::ignore, d_mg_aggregate_katz_centralities) =
+          cugraph::test::sort_by_key(handle,
+                                     d_mg_aggregate_renumber_map_labels.data(),
+                                     d_mg_aggregate_katz_centralities.data(),
+                                     d_mg_aggregate_renumber_map_labels.size());
+
+        // 5-3. create SG graph
+
+        cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, true, false> sg_graph(handle);
+        std::tie(sg_graph, std::ignore) =
+          input_usecase.template construct_graph<vertex_t, edge_t, weight_t, true, false>(
+            handle, katz_usecase.test_weighted, false);
+
+        auto sg_graph_view = sg_graph.view();
+
+        ASSERT_TRUE(mg_graph_view.get_number_of_vertices() ==
+                    sg_graph_view.get_number_of_vertices());
+
+        // 5-4. run SG Katz Centrality
+
+        rmm::device_uvector<result_t> d_sg_katz_centralities(
+          sg_graph_view.get_number_of_vertices(), handle.get_stream());
+
+        cugraph::experimental::katz_centrality(
+          handle,
+          sg_graph_view,
+          static_cast<result_t*>(nullptr),
+          d_sg_katz_centralities.data(),
+          alpha,
+          beta,
+          epsilon,
+          std::numeric_limits<size_t>::max(),  // max_iterations
+          false);
+
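// NOTE (not part of this patch): the comparison below uses a relative tolerance plus an
// absolute floor. A purely relative test would be too strict near zero: with
// threshold_ratio = 1e-3 and 10^6 vertices the floor is 1e-9, so scores of 1.0e-10 and
// 1.5e-10 (50% apart in relative terms) still compare equal; only lowly-ranked vertices
// fall under the floor.
+        // 5-5.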
compare + + std::vector h_mg_aggregate_katz_centralities( + mg_graph_view.get_number_of_vertices()); + raft::update_host(h_mg_aggregate_katz_centralities.data(), + d_mg_aggregate_katz_centralities.data(), + d_mg_aggregate_katz_centralities.size(), + handle.get_stream()); + + std::vector h_sg_katz_centralities(sg_graph_view.get_number_of_vertices()); + raft::update_host(h_sg_katz_centralities.data(), + d_sg_katz_centralities.data(), + d_sg_katz_centralities.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for low KatzCentrality verties (lowly ranked + // vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; + + ASSERT_TRUE(std::equal(h_mg_aggregate_katz_centralities.begin(), + h_mg_aggregate_katz_centralities.end(), + h_sg_katz_centralities.begin(), + nearly_equal)); + } + } + } +}; + +using Tests_MGKatzCentrality_File = Tests_MGKatzCentrality; +using Tests_MGKatzCentrality_Rmat = Tests_MGKatzCentrality; + +TEST_P(Tests_MGKatzCentrality_File, CheckInt32Int32FloatFloat) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MGKatzCentrality_Rmat, CheckInt32Int32FloatFloat) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_MGKatzCentrality_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(KatzCentrality_Usecase{false}, KatzCentrality_Usecase{true}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); + +INSTANTIATE_TEST_SUITE_P(rmat_small_test, + Tests_MGKatzCentrality_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values(KatzCentrality_Usecase{false}, + KatzCentrality_Usecase{true}), + ::testing::Values(cugraph::test::Rmat_Usecase( + 10, 16, 0.57, 0.19, 0.19, 0, false, false, 0, true)))); + +INSTANTIATE_TEST_SUITE_P(rmat_large_test, + Tests_MGKatzCentrality_Rmat, + ::testing::Combine( + // disable correctness checks for large graphs + ::testing::Values(KatzCentrality_Usecase{false, false}, + KatzCentrality_Usecase{true, false}), + ::testing::Values(cugraph::test::Rmat_Usecase( + 20, 32, 0.57, 0.19, 0.19, 0, false, false, 0, true)))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/mg_sssp_test.cpp b/cpp/tests/experimental/mg_sssp_test.cpp new file mode 100644 index 00000000000..ce2556bed00 --- /dev/null +++ b/cpp/tests/experimental/mg_sssp_test.cpp @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+// do the perf measurements
+// enabled by command line parameter '--perf'
+//
+static int PERF = 0;
+
+struct SSSP_Usecase {
+  size_t source{0};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGSSSP : public ::testing::TestWithParam<std::tuple<SSSP_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGSSSP() {}
+  static void SetupTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Compare the results of running SSSP on multiple GPUs to that of a single-GPU run
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(SSSP_Usecase const& sssp_usecase, input_usecase_t const& input_usecase)
+  {
+    // 1. initialize handle
+    raft::handle_t handle{};
+    HighResClock hr_clock{};
+
+    raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
+    auto& comm = handle.get_comms();
+    auto const comm_size = comm.get_size();
+    auto const comm_rank = comm.get_rank();
+
+    auto row_comm_size = static_cast<size_t>(sqrt(static_cast<double>(comm_size)));
+    while (comm_size % row_comm_size != 0) {
+      --row_comm_size;
+    }
+    cugraph::partition_2d::subcomm_factory_t<cugraph::partition_2d::key_naming_t, vertex_t>
+      subcomm_factory(handle, row_comm_size);
+
+    // 2. create MG graph
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      hr_clock.start();
+    }
+
+    auto [mg_graph, d_mg_renumber_map_labels] =
+      input_usecase.template construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        handle, true, true);
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG construct_graph took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    ASSERT_TRUE(static_cast<vertex_t>(sssp_usecase.source) >= 0 &&
+                static_cast<vertex_t>(sssp_usecase.source) < mg_graph_view.get_number_of_vertices())
+      << "Invalid starting source.";
+
+    // 3. run MG SSSP
+
+    rmm::device_uvector<weight_t> d_mg_distances(mg_graph_view.get_number_of_local_vertices(),
+                                                 handle.get_stream());
+    rmm::device_uvector<vertex_t> d_mg_predecessors(mg_graph_view.get_number_of_local_vertices(),
+                                                    handle.get_stream());
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      hr_clock.start();
+    }
+
+    cugraph::experimental::sssp(handle,
+                                mg_graph_view,
+                                d_mg_distances.data(),
+                                d_mg_predecessors.data(),
+                                static_cast<vertex_t>(sssp_usecase.source),
+                                std::numeric_limits<weight_t>::max());
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG SSSP took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
+    // 4. compare SG & MG results
+
+    if (sssp_usecase.check_correctness) {
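// NOTE (not part of this patch): distances are unique, so the MG and SG distance arrays
// can be compared element-wise. Shortest-path predecessors are not unique (ties between
// equal-cost paths may be broken differently), so each MG predecessor is instead checked
// structurally below: it must be a neighbor whose distance plus the connecting edge
// weight matches this vertex's distance.
+      // 4-1.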
aggregate MG results + + auto d_mg_aggregate_renumber_map_labels = cugraph::test::device_gatherv( + handle, (*d_mg_renumber_map_labels).data(), (*d_mg_renumber_map_labels).size()); + auto d_mg_aggregate_distances = + cugraph::test::device_gatherv(handle, d_mg_distances.data(), d_mg_distances.size()); + auto d_mg_aggregate_predecessors = + cugraph::test::device_gatherv(handle, d_mg_predecessors.data(), d_mg_predecessors.size()); + + if (handle.get_comms().get_rank() == int{0}) { + // 4-2. unrenumber MG results + + cugraph::experimental::unrenumber_int_vertices( + handle, + d_mg_aggregate_predecessors.data(), + d_mg_aggregate_predecessors.size(), + d_mg_aggregate_renumber_map_labels.data(), + vertex_t{0}, + mg_graph_view.get_number_of_vertices(), + std::vector{mg_graph_view.get_number_of_vertices()}); + + std::tie(std::ignore, d_mg_aggregate_distances) = + cugraph::test::sort_by_key(handle, + d_mg_aggregate_renumber_map_labels.data(), + d_mg_aggregate_distances.data(), + d_mg_aggregate_renumber_map_labels.size()); + std::tie(std::ignore, d_mg_aggregate_predecessors) = + cugraph::test::sort_by_key(handle, + d_mg_aggregate_renumber_map_labels.data(), + d_mg_aggregate_predecessors.data(), + d_mg_aggregate_renumber_map_labels.size()); + + // 4-3. create SG graph + + cugraph::experimental::graph_t sg_graph(handle); + std::tie(sg_graph, std::ignore) = + input_usecase.template construct_graph( + handle, true, false); + + auto sg_graph_view = sg_graph.view(); + + ASSERT_TRUE(mg_graph_view.get_number_of_vertices() == + sg_graph_view.get_number_of_vertices()); + + // 4-4. run SG SSSP + + rmm::device_uvector d_sg_distances(sg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + rmm::device_uvector d_sg_predecessors( + sg_graph_view.get_number_of_local_vertices(), handle.get_stream()); + vertex_t unrenumbered_source{}; + raft::update_host(&unrenumbered_source, + d_mg_aggregate_renumber_map_labels.data() + sssp_usecase.source, + size_t{1}, + handle.get_stream()); + handle.get_stream_view().synchronize(); + + cugraph::experimental::sssp(handle, + sg_graph_view, + d_sg_distances.data(), + d_sg_predecessors.data(), + unrenumbered_source, + std::numeric_limits::max()); + + // 4-5. 
compare + + std::vector h_sg_offsets(sg_graph_view.get_number_of_vertices() + 1); + std::vector h_sg_indices(sg_graph_view.get_number_of_edges()); + std::vector h_sg_weights(sg_graph_view.get_number_of_edges()); + raft::update_host(h_sg_offsets.data(), + sg_graph_view.get_matrix_partition_view().get_offsets(), + sg_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_sg_indices.data(), + sg_graph_view.get_matrix_partition_view().get_indices(), + sg_graph_view.get_number_of_edges(), + handle.get_stream()); + raft::update_host(h_sg_weights.data(), + *(sg_graph_view.get_matrix_partition_view().get_weights()), + sg_graph_view.get_number_of_edges(), + handle.get_stream()); + + std::vector h_mg_aggregate_distances(mg_graph_view.get_number_of_vertices()); + std::vector h_mg_aggregate_predecessors(mg_graph_view.get_number_of_vertices()); + raft::update_host(h_mg_aggregate_distances.data(), + d_mg_aggregate_distances.data(), + d_mg_aggregate_distances.size(), + handle.get_stream()); + raft::update_host(h_mg_aggregate_predecessors.data(), + d_mg_aggregate_predecessors.data(), + d_mg_aggregate_predecessors.size(), + handle.get_stream()); + + std::vector h_sg_distances(sg_graph_view.get_number_of_vertices()); + std::vector h_sg_predecessors(sg_graph_view.get_number_of_vertices()); + raft::update_host( + h_sg_distances.data(), d_sg_distances.data(), d_sg_distances.size(), handle.get_stream()); + raft::update_host(h_sg_predecessors.data(), + d_sg_predecessors.data(), + d_sg_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto max_weight_element = std::max_element(h_sg_weights.begin(), h_sg_weights.end()); + auto epsilon = *max_weight_element * weight_t{1e-6}; + auto nearly_equal = [epsilon](auto lhs, auto rhs) { + return std::fabs(lhs - rhs) < epsilon; + }; + + ASSERT_TRUE(std::equal(h_mg_aggregate_distances.begin(), + h_mg_aggregate_distances.end(), + h_sg_distances.begin(), + nearly_equal)); + + for (size_t i = 0; i < h_mg_aggregate_predecessors.size(); ++i) { + if (h_mg_aggregate_predecessors[i] == + cugraph::experimental::invalid_vertex_id::value) { + ASSERT_TRUE(h_sg_predecessors[i] == h_mg_aggregate_predecessors[i]) + << "vertex reachability does not match with the SG result."; + } else { + auto pred_distance = h_sg_distances[h_mg_aggregate_predecessors[i]]; + bool found{false}; + for (auto j = h_sg_offsets[h_mg_aggregate_predecessors[i]]; + j < h_sg_offsets[h_mg_aggregate_predecessors[i] + 1]; + ++j) { + if (h_sg_indices[j] == i) { + if (nearly_equal(pred_distance + h_sg_weights[j], h_sg_distances[i])) { + found = true; + break; + } + } + } + ASSERT_TRUE(found) + << "no edge from the predecessor vertex to this vertex with the matching weight."; + } + } + } + } + } +}; + +using Tests_MGSSSP_File = Tests_MGSSSP; +using Tests_MGSSSP_Rmat = Tests_MGSSSP; + +TEST_P(Tests_MGSSSP_File, CheckInt32Int32Float) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MGSSSP_Rmat, CheckInt32Int32Float) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_MGSSSP_File, + ::testing::Values( + // enable correctness checks + std::make_tuple(SSSP_Usecase{0}, cugraph::test::File_Usecase("test/datasets/karate.mtx")), + std::make_tuple(SSSP_Usecase{0}, cugraph::test::File_Usecase("test/datasets/dblp.mtx")), + std::make_tuple(SSSP_Usecase{1000}, + 
cugraph::test::File_Usecase("test/datasets/wiki2003.mtx")))); + +INSTANTIATE_TEST_SUITE_P(rmat_small_test, + Tests_MGSSSP_Rmat, + ::testing::Values( + // enable correctness checks + std::make_tuple(SSSP_Usecase{0}, + cugraph::test::Rmat_Usecase( + 10, 16, 0.57, 0.19, 0.19, 0, false, false, 0, true)))); + +INSTANTIATE_TEST_SUITE_P(rmat_large_test, + Tests_MGSSSP_Rmat, + ::testing::Values( + // disable correctness checks for large graphs + std::make_tuple(SSSP_Usecase{0, false}, + cugraph::test::Rmat_Usecase( + 20, 32, 0.57, 0.19, 0.19, 0, false, false, 0, true)))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/ms_bfs_test.cpp b/cpp/tests/experimental/ms_bfs_test.cpp new file mode 100644 index 00000000000..a6fb306f1d8 --- /dev/null +++ b/cpp/tests/experimental/ms_bfs_test.cpp @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef struct MsBfs_Usecase_t { + std::string graph_file_full_path{}; + std::vector sources{}; + int32_t radius; + bool test_weighted{false}; + + MsBfs_Usecase_t(std::string const& graph_file_path, + std::vector const& sources, + int32_t radius, + bool test_weighted) + : sources(sources), radius(radius), test_weighted(test_weighted) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +} MsBfs_Usecase; + +class Tests_MsBfs : public ::testing::TestWithParam { + public: + Tests_MsBfs() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(MsBfs_Usecase const& configuration) + { + auto n_seeds = configuration.sources.size(); + int n_streams = std::min(n_seeds, static_cast(128)); + raft::handle_t handle(n_streams); + + cugraph::experimental::graph_t graph( + handle); + std::tie(graph, std::ignore) = cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.graph_file_full_path, configuration.test_weighted, false); + auto graph_view = graph.view(); + // Streams will allocate concurrently later + std::vector> d_distances{}; + std::vector> d_predecessors{}; + + d_distances.reserve(n_seeds); + d_predecessors.reserve(n_seeds); + for (vertex_t i = 0; i < n_seeds; i++) { + // Allocations and operations are attached to the worker stream + rmm::device_uvector tmp_distances(graph_view.get_number_of_vertices(), + handle.get_internal_stream_view(i)); + rmm::device_uvector tmp_predecessors(graph_view.get_number_of_vertices(), + handle.get_internal_stream_view(i)); + + d_distances.push_back(std::move(tmp_distances)); + d_predecessors.push_back(std::move(tmp_predecessors)); + } + + std::vector 
radius(n_seeds); + std::generate(radius.begin(), radius.end(), [n = 0]() mutable { return (n++ % 12 + 1); }); + + // warm up + cugraph::experimental::bfs(handle, + graph_view, + d_distances[0].begin(), + d_predecessors[0].begin(), + static_cast(configuration.sources[0]), + false, + radius[0]); + + // one by one + HighResTimer hr_timer; + hr_timer.start("bfs"); + cudaProfilerStart(); + for (vertex_t i = 0; i < n_seeds; i++) { + cugraph::experimental::bfs(handle, + graph_view, + d_distances[i].begin(), + d_predecessors[i].begin(), + static_cast(configuration.sources[i]), + false, + radius[i]); + } + cudaProfilerStop(); + hr_timer.stop(); + hr_timer.display(std::cout); + + // concurrent + hr_timer.start("bfs"); + cudaProfilerStart(); +#pragma omp parallel for + for (vertex_t i = 0; i < n_seeds; i++) { + raft::handle_t light_handle(handle, i); + auto worker_stream_view = light_handle.get_stream_view(); + cugraph::experimental::bfs(light_handle, + graph_view, + d_distances[i].begin(), + d_predecessors[i].begin(), + static_cast(configuration.sources[i]), + false, + radius[i]); + } + + cudaProfilerStop(); + hr_timer.stop(); + hr_timer.display(std::cout); + } +}; + +TEST_P(Tests_MsBfs, DISABLED_CheckInt32Int32FloatUntransposed) +{ + run_current_test(GetParam()); +} +/* +INSTANTIATE_TEST_SUITE_P( + simple_test, + Tests_MsBfs, + ::testing::Values( + MsBfs_Usecase("test/datasets/karate.mtx", std::vector{0}, 1, false), + MsBfs_Usecase("test/datasets/karate.mtx", std::vector{0}, 2, false), + MsBfs_Usecase("test/datasets/karate.mtx", std::vector{1}, 3, false), + MsBfs_Usecase("test/datasets/karate.mtx", std::vector{10, 0, 5}, 2, false), + MsBfs_Usecase("test/datasets/karate.mtx", std::vector{9, 3, 10}, 2, false), + MsBfs_Usecase( + "test/datasets/karate.mtx", std::vector{5, 9, 3, 10, 12, 13}, 2, true))); +*/ +// For perf analysis + +INSTANTIATE_TEST_SUITE_P( + simple_test, + Tests_MsBfs, + ::testing::Values( + MsBfs_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{363617}, 2, false), + MsBfs_Usecase( + "test/datasets/soc-LiveJournal1.mtx", + std::vector{ + 363617, 722214, 2337449, 2510183, 2513389, 225853, 2035807, 3836330, 1865496, 28755}, + 2, + false), + MsBfs_Usecase( + "test/datasets/soc-LiveJournal1.mtx", + std::vector{ + 363617, 722214, 2337449, 2510183, 2513389, 225853, 2035807, 3836330, 1865496, 28755, + 2536834, 3070144, 3888415, 3131712, 2382526, 1040771, 2631543, 4607218, 4465829, 3341686, + 2772973, 2611175, 4526129, 2624421, 1220593, 2593137, 3270705, 1503899, 1213033, 4840102, + 4529036, 3421116, 4264831, 4089751, 4272322, 3486998, 2830318, 320953, 2388331, 520808, + 3023094, 1600294, 3631119, 1716614, 4829213, 1175844, 960680, 847662, 3277365, 3957318, + 3455123, 2454259, 670953, 4465677, 1027332, 2560721, 89061, 1163406, 3109528, 3221856, + 4714426, 2382774, 37828, 4433616, 3283229, 591911, 4200188, 442522, 872207, 2437601, + 741003, 266241, 914618, 3626195, 2021080, 4679624, 777476, 2527796, 1114017, 640142, + 49259, 4069879, 3869098, 1105040, 4707804, 3208582, 3325885, 1450601, 4072548, 2037062, + 2029646, 4575891, 1488598, 79105, 4827273, 3795434, 4647518, 4733397, 3980718, 1184627}, + 2, + false), + MsBfs_Usecase( + "test/datasets/soc-LiveJournal1.mtx", + std::vector{ + 363617, 722214, 2337449, 2510183, 2513389, 225853, 2035807, 3836330, 1865496, 28755, + 2536834, 3070144, 3888415, 3131712, 2382526, 1040771, 2631543, 4607218, 4465829, 3341686, + 2772973, 2611175, 4526129, 2624421, 1220593, 2593137, 3270705, 1503899, 1213033, 4840102, + 4529036, 3421116, 4264831, 4089751, 
4272322, 3486998, 2830318, 320953, 2388331, 520808, + 3023094, 1600294, 3631119, 1716614, 4829213, 1175844, 960680, 847662, 3277365, 3957318, + 3455123, 2454259, 670953, 4465677, 1027332, 2560721, 89061, 1163406, 3109528, 3221856, + 4714426, 2382774, 37828, 4433616, 3283229, 591911, 4200188, 442522, 872207, 2437601, + 741003, 266241, 914618, 3626195, 2021080, 4679624, 777476, 2527796, 1114017, 640142, + 49259, 4069879, 3869098, 1105040, 4707804, 3208582, 3325885, 1450601, 4072548, 2037062, + 2029646, 4575891, 1488598, 79105, 4827273, 3795434, 4647518, 4733397, 3980718, 1184627, + 984983, 3114832, 1967741, 1599818, 144593, 2698770, 2889449, 2495550, 1053813, 1193622, + 686026, 3989015, 2040719, 4693428, 3190376, 2926728, 3399030, 1664419, 662429, 4526841, + 2186957, 3752558, 2440046, 2930226, 3633006, 4058166, 3137060, 3499296, 2126343, 148971, + 2199672, 275811, 2813976, 2274536, 1189239, 1335942, 2465624, 2596042, 829684, 193400, + 2682845, 3691697, 4022437, 4051170, 4195175, 2876420, 3984220, 2174475, 326134, 2606530, + 2493046, 4706121, 1498980, 4576225, 1271339, 44832, 1875673, 4664940, 134931, 736397, + 4333554, 2751031, 2163610, 2879676, 3174153, 3317403, 2052464, 1881883, 4757859, 3596257, + 2358088, 2578758, 447504, 590720, 1717038, 1869795, 1133885, 3027521, 840312, 2818881, + 3654321, 2730947, 353585, 1134903, 2223378, 1508824, 3662521, 1363776, 2712071, 288441, + 1204581, 3502242, 4645567, 2767267, 1514366, 3956099, 1422145, 1216608, 2253360, 189132, + 4238225, 1345783, 451571, 1599442, 3237284, 4711405, 929446, 1857675, 150759, 1277633, + 761210, 138628, 1026833, 2599544, 2464737, 989203, 3399615, 2144292, 216142, 637312, + 2044964, 716256, 1660632, 1762919, 4784357, 2213415, 2764769, 291806, 609772, 3264819, + 1870953, 1516385, 235647, 1045474, 2664957, 819095, 1824119, 4045271, 4448109, 1676788, + 4285177, 1580502, 3546548, 2771971, 3927086, 1339779, 3156204, 1730998, 1172522, 2433024, + 4533449, 479930, 2010695, 672994, 3542039, 3176455, 26352, 2137735, 866910, 4410835, + 2623982, 3603159, 2555625, 2765653, 267865, 2015523, 1009052, 4713994, 1600667, 2176195, + 3179631, 4570390, 2018424, 3356384, 1784287, 894861, 3622099, 1647273, 3044136, 950354, + 1491760, 3416929, 3757300, 2244912, 4129215, 1600848, 3867343, 72329, 919189, 992521, + 3445975, 4712557, 4680974, 188419, 2612093, 1991268, 3566207, 2281468, 3859078, 2492806, + 3398628, 763441, 2679107, 2554420, 2130132, 4664374, 1182901, 3890770, 4714667, 4209303, + 4013060, 3617653, 2040022, 3296519, 4190671, 1693353, 2678411, 3788834, 2781815, 191965, + 1083926, 503974, 3529226, 1650522, 1900976, 542080, 3423929, 3418905, 878165, 4701703, + 3022790, 4316365, 76365, 4053672, 1358185, 3830478, 4445661, 3210024, 1895915, 4541133, + 2938808, 562788, 3920065, 1458776, 4052046, 2967475, 1092809, 3203538, 159626, 3399464, + 214467, 3343982, 1811854, 3189045, 4272117, 4701563, 424807, 4341116, 760545, 4674683, + 1538018, 386762, 194237, 2162719, 1694433, 943728, 2389036, 2196653, 3085571, 1513424, + 3689413, 3278747, 4197291, 3324063, 3651090, 1737936, 2768803, 2768889, 3108096, 4311775, + 3569480, 886705, 733256, 2477493, 1735412, 2960895, 1983781, 1861797, 3566460, 4537673, + 1164093, 3499764, 4553071, 3518985, 847658, 918948, 2922351, 1056144, 652895, 1013195, + 780505, 1702928, 3562838, 1432719, 2405207, 1054920, 641647, 2240939, 3617702, 383165, + 652641, 879593, 1810739, 2096385, 4497865, 4768530, 1743968, 3582014, 1025009, 3002122, + 2422190, 527647, 1251821, 2571153, 4095874, 3705333, 3637407, 1385567, 4043855, 4041930, + 
2433139, 1710383, 1127734, 4362316, 711588, 817839, 3214775, 910077, 1313768, 2382229, + 16864, 2081770, 3095420, 3195272, 548711, 2259860, 1167323, 2435974, 425238, 2085179, + 2630042, 2632881, 2867923, 3703565, 1037695, 226617, 4379130, 1541468, 3581937, 605965, + 1137674, 4655221, 4769963, 1394370, 4425315, 2990132, 2364485, 1561137, 2713384, 481509, + 2900382, 934766, 2986774, 1767669, 298593, 2502539, 139296, 3794229, 4002180, 4718138, + 2909238, 423691, 3023810, 2784924, 2760160, 1971980, 316683, 3828090, 3253691, 4839313, + 1203624, 584938, 3901482, 1747543, 1572737, 3533226, 774708, 1691195, 1037110, 1557763, + 225120, 4424243, 3524086, 1717663, 4332507, 3513592, 4274932, 1232118, 873498, 1416042, + 2488925, 111391, 4704545, 4492545, 445317, 1584812, 2187737, 2471948, 3731678, 219255, + 2282627, 2589971, 2372185, 4609096, 3673961, 2524410, 12823, 2437155, 3015974, 4188352, + 3184084, 3690756, 1222341, 1278376, 3652030, 4162647, 326548, 3930062, 3926100, 1551222, + 2722165, 4526695, 3997534, 4815513, 3139056, 2547644, 3028915, 4149092, 3656554, 2691582, + 2676699, 1878842, 260174, 3129900, 4379993, 182347, 2189338, 3783616, 2616666, 2596952, + 243007, 4179282, 2730, 1939894, 2332032, 3335636, 182332, 3112260, 2174584, 587481, + 4527368, 3154106, 3403059, 673206, 2150292, 446521, 1600204, 4819428, 2591357, 48490, + 2917012, 2285923, 1072926, 2824281, 4364250, 956033, 311938, 37251, 3729300, 2726300, + 644966, 1623020, 1419070, 4646747, 2417222, 2680238, 2561083, 1793801, 2349366, 339747, + 611366, 4684147, 4356907, 1277161, 4510381, 3218352, 4161658, 3200733, 1172372, 3997786, + 3169266, 3353418, 2248955, 2875885, 2365369, 498208, 2968066, 2681505, 2059048, 2097106, + 3607540, 1121504, 2016789, 1762605, 3138431, 866081, 3705757, 3833066, 2599788, 760816, + 4046672, 1544367, 2983906, 4842911, 209599, 1250954, 3333704, 561212, 4674336, 2831841, + 3690724, 2929360, 4830834, 1177524, 2487687, 3525137, 875283, 651241, 2110742, 1296646, + 1543739, 4349417, 2384725, 1931751, 1519208, 1520034, 3385008, 3219962, 734912, 170230, + 1741419, 729913, 2860117, 2362381, 1199807, 2424230, 177824, 125948, 2722701, 4687548, + 1140771, 3232742, 4522020, 4376360, 1125603, 590312, 2481884, 138951, 4086775, 615155, + 3395781, 4587272, 283209, 568470, 4296185, 4344150, 2454321, 2672602, 838828, 4051647, + 1709120, 3074610, 693235, 4356087, 3018806, 239410, 2431497, 691186, 766276, 4462126, + 859155, 2370304, 1571808, 1938673, 1694955, 3871296, 4245059, 3987376, 301524, 2512461, + 3410437, 3300380, 684922, 4581995, 3599557, 683515, 1850634, 3704678, 1937490, 2035591, + 3718533, 2065879, 3160765, 1467884, 1912241, 2501509, 3668572, 3390469, 2501150, 612319, + 713633, 1976262, 135946, 3641535, 632083, 13414, 4217765, 4137712, 2550250, 3281035, + 4179598, 961045, 2020694, 4380006, 1345936, 289162, 1359035, 770872, 4509911, 3947317, + 4719693, 248568, 2625660, 1237232, 2153208, 4814282, 1259954, 3677369, 861222, 2883506, + 3339149, 3998335, 491017, 1609022, 2648112, 742132, 649609, 4206953, 3131106, 3504814, + 3344486, 611721, 3215620, 2856233, 4447505, 1949222, 1868345, 712710, 6966, 4730666, + 3181872, 2972889, 3038521, 3525444, 4385208, 1845613, 1124187, 2030476, 4468651, 2478792, + 3473580, 3783357, 1852991, 1648485, 871319, 1670723, 4458328, 3218600, 1811100, 3443356, + 2233873, 3035207, 2548692, 3337891, 3773674, 1552957, 4782811, 3144712, 3523466, 1491315, + 3955852, 1838410, 3164028, 1092543, 776459, 2959379, 2541744, 4064418, 3908320, 2854145, + 3960709, 1348188, 977678, 853619, 1304291, 2848702, 
1657913, 1319826, 3322665, 788037, + 2913686, 4471279, 1766285, 348304, 56570, 1892118, 4017244, 401006, 3524539, 4310134, + 1624693, 4081113, 957511, 849400, 129975, 2616130, 378537, 1556787, 3916162, 1039980, + 4407778, 2027690, 4213675, 839863, 683134, 75805, 2493150, 4215796, 81587, 751845, + 1255588, 1947964, 1950470, 859401, 3077088, 3931110, 2316256, 1523761, 4527477, 4237511, + 1123513, 4209796, 3584772, 4250563, 2091754, 1618766, 2139944, 4525352, 382159, 2955887, + 41760, 2313998, 496912, 3791570, 3904792, 3613654, 873959, 127076, 2537797, 2458107, + 4543265, 3661909, 26828, 271816, 17854, 2461269, 1776042, 1573899, 3409957, 4335712, + 4534313, 3392751, 1230124, 2159031, 4444015, 3373087, 3848014, 2026600, 1382747, 3537242, + 4536743, 4714155, 3788371, 3570849, 173741, 211962, 4377778, 119369, 2856973, 2945854, + 1508054, 4503932, 3141566, 1842177, 3448683, 3384614, 2886508, 1573965, 990618, 3053734, + 2918742, 4508753, 1032149, 60943, 4291620, 722607, 2883224, 169359, 4356585, 3725543, + 3678729, 341673, 3592828, 4077251, 3382936, 3885685, 4630994, 1286698, 4449616, 1138430, + 3113385, 4660578, 2539973, 4562286, 4085089, 494737, 3967610, 2130702, 1823755, 1369324, + 3796951, 956299, 141730, 935144, 4381893, 4412545, 1382250, 3024476, 2364546, 3396164, + 3573511, 314081, 577688, 4154135, 1567018, 4047761, 2446220, 1148833, 4842497, 3967186, + 1175290, 3749667, 1209593, 3295627, 3169065, 2460328, 1838486, 1436923, 2843887, 3676426, + 2079145, 2975635, 535071, 4287509, 3281107, 39606, 3115500, 3204573, 722131, 3124073}, + 2, + false))); +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/pagerank_test.cpp b/cpp/tests/experimental/pagerank_test.cpp index 4763249aa9e..105cf38acef 100644 --- a/cpp/tests/experimental/pagerank_test.cpp +++ b/cpp/tests/experimental/pagerank_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,12 +14,16 @@ * limitations under the License. */ +#include #include +#include #include +#include -#include -#include -#include +#include +#include +#include +#include #include #include @@ -35,15 +39,20 @@ #include #include +// do the perf measurements +// enabled by command line parameter s'--perf' +// +static int PERF = 0; + template -void pagerank_reference(edge_t* offsets, - vertex_t* indices, - weight_t* weights, - vertex_t* personalization_vertices, - result_t* personalization_values, +void pagerank_reference(edge_t const* offsets, + vertex_t const* indices, + std::optional weights, + std::optional personalization_vertices, + std::optional personalization_values, + std::optional personalization_vector_size, result_t* pageranks, vertex_t num_vertices, - vertex_t personalization_vector_size, result_t alpha, result_t epsilon, size_t max_iterations, @@ -52,7 +61,11 @@ void pagerank_reference(edge_t* offsets, if (num_vertices == 0) { return; } if (has_initial_guess) { - auto sum = std::accumulate(pageranks, pageranks + num_vertices, result_t{0.0}); + // use a double type counter (instead of result_t) to accumulate as std::accumulate is + // inaccurate in adding a large number of comparably sized numbers. In C++17 or later, + // std::reduce may be a better option. 
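+    // (illustrative aside, not exercised by the test: repeatedly adding 0.1f one
+    // million times in a float accumulator yields roughly 100958 instead of the
+    // exact 100000, while a double accumulator keeps the error near round-off.)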
+ auto sum = + static_cast(std::accumulate(pageranks, pageranks + num_vertices, double{0.0})); ASSERT_TRUE(sum > 0.0); std::for_each(pageranks, pageranks + num_vertices, [sum](auto& val) { val /= sum; }); } else { @@ -61,20 +74,23 @@ void pagerank_reference(edge_t* offsets, }); } - if (personalization_vertices != nullptr) { - auto sum = std::accumulate( - personalization_values, personalization_values + personalization_vector_size, result_t{0.0}); - ASSERT_TRUE(sum > 0.0); - std::for_each(personalization_values, - personalization_values + personalization_vector_size, - [sum](auto& val) { val /= sum; }); + result_t personalization_sum{0.0}; + if (personalization_vertices) { + // use a double type counter (instead of result_t) to accumulate as std::accumulate is + // inaccurate in adding a large number of comparably sized numbers. In C++17 or later, + // std::reduce may be a better option. + personalization_sum = + static_cast(std::accumulate(*personalization_values, + *personalization_values + *personalization_vector_size, + double{0.0})); + ASSERT_TRUE(personalization_sum > 0.0); } std::vector out_weight_sums(num_vertices, result_t{0.0}); for (vertex_t i = 0; i < num_vertices; ++i) { for (auto j = *(offsets + i); j < *(offsets + i + 1); ++j) { auto nbr = indices[j]; - auto w = weights != nullptr ? weights[j] : 1.0; + auto w = weights ? (*weights)[j] : weight_t{1.0}; out_weight_sums[nbr] += w; } } @@ -91,18 +107,19 @@ void pagerank_reference(edge_t* offsets, pageranks[i] = result_t{0.0}; for (auto j = *(offsets + i); j < *(offsets + i + 1); ++j) { auto nbr = indices[j]; - auto w = weights != nullptr ? weights[j] : result_t{1.0}; + auto w = weights ? (*weights)[j] : result_t{1.0}; pageranks[i] += alpha * old_pageranks[nbr] * (w / out_weight_sums[nbr]); } - if (personalization_vertices == nullptr) { + if (!personalization_vertices) { pageranks[i] += (dangling_sum * alpha + (1.0 - alpha)) / static_cast(num_vertices); } } - if (personalization_vertices != nullptr) { - for (vertex_t i = 0; i < personalization_vector_size; ++i) { - auto v = personalization_vertices[i]; - pageranks[v] += (dangling_sum * alpha + (1.0 - alpha)) * personalization_values[i]; + if (personalization_vertices) { + for (vertex_t i = 0; i < *personalization_vector_size; ++i) { + auto v = (*personalization_vertices)[i]; + pageranks[v] += (dangling_sum * alpha + (1.0 - alpha)) * + ((*personalization_values)[i] / personalization_sum); } } result_t diff_sum{0.0}; @@ -117,25 +134,15 @@ void pagerank_reference(edge_t* offsets, return; } -typedef struct PageRank_Usecase_t { - std::string graph_file_full_path{}; +struct PageRank_Usecase { double personalization_ratio{0.0}; bool test_weighted{false}; + bool check_correctness{true}; +}; - PageRank_Usecase_t(std::string const& graph_file_path, - double personalization_ratio, - bool test_weighted) - : personalization_ratio(personalization_ratio), test_weighted(test_weighted) - { - if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { - graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; - } else { - graph_file_full_path = graph_file_path; - } - }; -} PageRank_Usecase; - -class Tests_PageRank : public ::testing::TestWithParam { +template +class Tests_PageRank + : public ::testing::TestWithParam> { public: Tests_PageRank() {} static void SetupTestCase() {} @@ -145,163 +152,317 @@ class Tests_PageRank : public ::testing::TestWithParam { virtual void TearDown() {} template - void run_current_test(PageRank_Usecase const& configuration) 
+ void run_current_test(PageRank_Usecase const& pagerank_usecase, + input_usecase_t const& input_usecase) { + constexpr bool renumber = true; + raft::handle_t handle{}; + HighResClock hr_clock{}; - auto graph = - cugraph::test::read_graph_from_matrix_market_file( - handle, configuration.graph_file_full_path, configuration.test_weighted); - auto graph_view = graph.view(); + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - std::vector h_weights{}; - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - if (graph_view.is_weighted()) { - h_weights.assign(graph_view.get_number_of_edges(), weight_t{0.0}); - raft::update_host(h_weights.data(), - graph_view.weights(), - graph_view.get_number_of_edges(), - handle.get_stream()); + auto [graph, d_renumber_map_labels] = + input_usecase.template construct_graph( + handle, pagerank_usecase.test_weighted, renumber); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "construct_graph took " << elapsed_time * 1e-6 << " s.\n"; } - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - std::vector h_personalization_vertices{}; - std::vector h_personalization_values{}; - if (configuration.personalization_ratio > 0.0) { - std::random_device r{}; - std::default_random_engine generator{r()}; + auto graph_view = graph.view(); + + std::optional> h_personalization_vertices{std::nullopt}; + std::optional> h_personalization_values{std::nullopt}; + if (pagerank_usecase.personalization_ratio > 0.0) { + std::default_random_engine generator{}; std::uniform_real_distribution distribution{0.0, 1.0}; - h_personalization_vertices.resize(graph_view.get_number_of_local_vertices()); - std::iota(h_personalization_vertices.begin(), - h_personalization_vertices.end(), + h_personalization_vertices = std::vector(graph_view.get_number_of_local_vertices()); + std::iota((*h_personalization_vertices).begin(), + (*h_personalization_vertices).end(), graph_view.get_local_vertex_first()); - h_personalization_vertices.erase( - std::remove_if(h_personalization_vertices.begin(), - h_personalization_vertices.end(), - [&generator, &distribution, configuration](auto v) { - return distribution(generator) >= configuration.personalization_ratio; - }), - h_personalization_vertices.end()); - h_personalization_values.resize(h_personalization_vertices.size()); - std::for_each(h_personalization_values.begin(), - h_personalization_values.end(), + (*h_personalization_vertices) + .erase(std::remove_if((*h_personalization_vertices).begin(), + (*h_personalization_vertices).end(), + [&generator, &distribution, pagerank_usecase](auto v) { + return distribution(generator) >= + pagerank_usecase.personalization_ratio; + }), + (*h_personalization_vertices).end()); + h_personalization_values = std::vector((*h_personalization_vertices).size()); + std::for_each((*h_personalization_values).begin(), + (*h_personalization_values).end(), [&distribution, &generator](auto& val) { val = distribution(generator); }); - auto sum = std::accumulate( - h_personalization_values.begin(), h_personalization_values.end(), 
result_t{0.0}); - std::for_each(h_personalization_values.begin(), - h_personalization_values.end(), + // use a double type counter (instead of result_t) to accumulate as std::accumulate is + // inaccurate in adding a large number of comparably sized numbers. In C++17 or later, + // std::reduce may be a better option. + auto sum = static_cast(std::accumulate( + (*h_personalization_values).begin(), (*h_personalization_values).end(), double{0.0})); + std::for_each((*h_personalization_values).begin(), + (*h_personalization_values).end(), [sum](auto& val) { val /= sum; }); } - rmm::device_uvector d_personalization_vertices(h_personalization_vertices.size(), - handle.get_stream()); - rmm::device_uvector d_personalization_values(d_personalization_vertices.size(), - handle.get_stream()); - if (d_personalization_vertices.size() > 0) { - raft::update_device(d_personalization_vertices.data(), - h_personalization_vertices.data(), - h_personalization_vertices.size(), + auto d_personalization_vertices = + h_personalization_vertices ? std::make_optional>( + (*h_personalization_vertices).size(), handle.get_stream()) + : std::nullopt; + auto d_personalization_values = h_personalization_values + ? std::make_optional>( + (*d_personalization_vertices).size(), handle.get_stream()) + : std::nullopt; + if (d_personalization_vertices) { + raft::update_device((*d_personalization_vertices).data(), + (*h_personalization_vertices).data(), + (*h_personalization_vertices).size(), handle.get_stream()); - raft::update_device(d_personalization_values.data(), - h_personalization_values.data(), - h_personalization_values.size(), + raft::update_device((*d_personalization_values).data(), + (*h_personalization_values).data(), + (*h_personalization_values).size(), handle.get_stream()); } - std::vector h_reference_pageranks(graph_view.get_number_of_vertices()); - result_t constexpr alpha{0.85}; result_t constexpr epsilon{1e-6}; - pagerank_reference(h_offsets.data(), - h_indices.data(), - h_weights.size() > 0 ? 
h_weights.data() : static_cast(nullptr), - h_personalization_vertices.data(), - h_personalization_values.data(), - h_reference_pageranks.data(), - graph_view.get_number_of_vertices(), - static_cast(h_personalization_vertices.size()), - alpha, - epsilon, - std::numeric_limits::max(), - false); - rmm::device_uvector d_pageranks(graph_view.get_number_of_vertices(), handle.get_stream()); - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - - cugraph::experimental::pagerank(handle, - graph_view, - static_cast(nullptr), - d_personalization_vertices.data(), - d_personalization_values.data(), - static_cast(d_personalization_vertices.size()), - d_pageranks.begin(), - alpha, - epsilon, - std::numeric_limits::max(), - false, - false); - - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - - std::vector h_cugraph_pageranks(graph_view.get_number_of_vertices()); - - raft::update_host( - h_cugraph_pageranks.data(), d_pageranks.data(), d_pageranks.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - auto threshold_ratio = 1e-3; - auto threshold_magnitude = - (epsilon / static_cast(graph_view.get_number_of_vertices())) * threshold_ratio; - auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { - auto diff = std::abs(lhs - rhs); - return (diff < std::max(lhs, rhs) * threshold_ratio) || (diff < threshold_magnitude); - }; - - ASSERT_TRUE(std::equal(h_reference_pageranks.begin(), - h_reference_pageranks.end(), - h_cugraph_pageranks.begin(), - nearly_equal)) - << "PageRank values do not match with the reference values."; + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } + + cugraph::experimental::pagerank( + handle, + graph_view, + std::nullopt, + d_personalization_vertices + ? std::optional{(*d_personalization_vertices).data()} + : std::nullopt, + d_personalization_values ? std::optional{(*d_personalization_values).data()} + : std::nullopt, + d_personalization_vertices ? std::optional{(*d_personalization_vertices).size()} + : std::nullopt, + d_pageranks.data(), + alpha, + epsilon, + std::numeric_limits::max(), + false, + false); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "PageRank took " << elapsed_time * 1e-6 << " s.\n"; + } + + if (pagerank_usecase.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore) = + input_usecase.template construct_graph( + handle, pagerank_usecase.test_weighted, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + auto h_weights = unrenumbered_graph_view.is_weighted() + ? 
std::make_optional>( + unrenumbered_graph_view.get_number_of_edges(), weight_t{0.0}) + : std::nullopt; + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.get_matrix_partition_view().get_offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.get_matrix_partition_view().get_indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + if (h_weights) { + raft::update_host((*h_weights).data(), + *(unrenumbered_graph_view.get_matrix_partition_view().get_weights()), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + } + + auto h_unrenumbered_personalization_vertices = + d_personalization_vertices + ? std::make_optional>((*d_personalization_vertices).size()) + : std::nullopt; + auto h_unrenumbered_personalization_values = + d_personalization_vertices + ? std::make_optional>((*d_personalization_vertices).size()) + : std::nullopt; + if (h_unrenumbered_personalization_vertices) { + if (renumber) { + rmm::device_uvector d_unrenumbered_personalization_vertices( + (*d_personalization_vertices).size(), handle.get_stream()); + rmm::device_uvector d_unrenumbered_personalization_values( + d_unrenumbered_personalization_vertices.size(), handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_vertices.data(), + (*d_personalization_vertices).data(), + (*d_personalization_vertices).size(), + handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_values.data(), + (*d_personalization_values).data(), + (*d_personalization_values).size(), + handle.get_stream()); + cugraph::experimental::unrenumber_local_int_vertices( + handle, + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.size(), + (*d_renumber_map_labels).data(), + vertex_t{0}, + graph_view.get_number_of_vertices()); + std::tie(d_unrenumbered_personalization_vertices, d_unrenumbered_personalization_values) = + cugraph::test::sort_by_key(handle, + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_values.data(), + d_unrenumbered_personalization_vertices.size()); + + raft::update_host((*h_unrenumbered_personalization_vertices).data(), + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.size(), + handle.get_stream()); + raft::update_host((*h_unrenumbered_personalization_values).data(), + d_unrenumbered_personalization_values.data(), + d_unrenumbered_personalization_values.size(), + handle.get_stream()); + } else { + raft::update_host((*h_unrenumbered_personalization_vertices).data(), + (*d_personalization_vertices).data(), + (*d_personalization_vertices).size(), + handle.get_stream()); + raft::update_host((*h_unrenumbered_personalization_values).data(), + (*d_personalization_values).data(), + (*d_personalization_values).size(), + handle.get_stream()); + } + } + + handle.get_stream_view().synchronize(); + + std::vector h_reference_pageranks(unrenumbered_graph_view.get_number_of_vertices()); + + pagerank_reference( + h_offsets.data(), + h_indices.data(), + h_weights ? std::optional{(*h_weights).data()} : std::nullopt, + h_unrenumbered_personalization_vertices + ? std::optional{(*h_unrenumbered_personalization_vertices).data()} + : std::nullopt, + h_unrenumbered_personalization_values + ? std::optional{(*h_unrenumbered_personalization_values).data()} + : std::nullopt, + h_unrenumbered_personalization_vertices + ? 
std::optional{static_cast( + (*h_unrenumbered_personalization_vertices).size())} + : std::nullopt, + h_reference_pageranks.data(), + unrenumbered_graph_view.get_number_of_vertices(), + alpha, + epsilon, + std::numeric_limits::max(), + false); + + std::vector h_cugraph_pageranks(graph_view.get_number_of_vertices()); + if (renumber) { + rmm::device_uvector d_unrenumbered_pageranks(size_t{0}, handle.get_stream()); + std::tie(std::ignore, d_unrenumbered_pageranks) = + cugraph::test::sort_by_key(handle, + (*d_renumber_map_labels).data(), + d_pageranks.data(), + (*d_renumber_map_labels).size()); + raft::update_host(h_cugraph_pageranks.data(), + d_unrenumbered_pageranks.data(), + d_unrenumbered_pageranks.size(), + handle.get_stream()); + } else { + raft::update_host( + h_cugraph_pageranks.data(), d_pageranks.data(), d_pageranks.size(), handle.get_stream()); + } + + handle.get_stream_view().synchronize(); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for low PageRank verties (lowly ranked vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; + + ASSERT_TRUE(std::equal(h_reference_pageranks.begin(), + h_reference_pageranks.end(), + h_cugraph_pageranks.begin(), + nearly_equal)) + << "PageRank values do not match with the reference values."; + } } }; +using Tests_PageRank_File = Tests_PageRank; +using Tests_PageRank_Rmat = Tests_PageRank; + +// FIXME: add tests for type combinations +TEST_P(Tests_PageRank_File, CheckInt32Int32FloatFloat) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + // FIXME: add tests for type combinations -TEST_P(Tests_PageRank, CheckInt32Int32FloatFloat) +TEST_P(Tests_PageRank_Rmat, CheckInt32Int32FloatFloat) { - run_current_test(GetParam()); + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); } -INSTANTIATE_TEST_CASE_P( - simple_test, - Tests_PageRank, - ::testing::Values(PageRank_Usecase("test/datasets/karate.mtx", 0.0, false), - PageRank_Usecase("test/datasets/karate.mtx", 0.5, false), - PageRank_Usecase("test/datasets/karate.mtx", 0.0, true), - PageRank_Usecase("test/datasets/karate.mtx", 0.5, true), - PageRank_Usecase("test/datasets/web-Google.mtx", 0.0, false), - PageRank_Usecase("test/datasets/web-Google.mtx", 0.5, false), - PageRank_Usecase("test/datasets/web-Google.mtx", 0.0, true), - PageRank_Usecase("test/datasets/web-Google.mtx", 0.5, true), - PageRank_Usecase("test/datasets/ljournal-2008.mtx", 0.0, false), - PageRank_Usecase("test/datasets/ljournal-2008.mtx", 0.5, false), - PageRank_Usecase("test/datasets/ljournal-2008.mtx", 0.0, true), - PageRank_Usecase("test/datasets/ljournal-2008.mtx", 0.5, true), - PageRank_Usecase("test/datasets/webbase-1M.mtx", 0.0, false), - // FIXME: Re-enable test after failures are addressed - // PageRank_Usecase("test/datasets/webbase-1M.mtx", 0.5, false), - PageRank_Usecase("test/datasets/webbase-1M.mtx", 0.0, true), - PageRank_Usecase("test/datasets/webbase-1M.mtx", 0.5, true))); +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_PageRank_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(PageRank_Usecase{0.0, false}, + PageRank_Usecase{0.5, false}, + PageRank_Usecase{0.0, true}, + PageRank_Usecase{0.5, true}), + 
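// (note: ::testing::Combine forms the full cross product, so every PageRank_Usecase above is paired with every input graph below.) +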
::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_tests,
+  Tests_PageRank_Rmat,
+  ::testing::Combine(
+    // enable correctness checks
+    ::testing::Values(PageRank_Usecase{0.0, false},
+                      PageRank_Usecase{0.5, false},
+                      PageRank_Usecase{0.0, true},
+                      PageRank_Usecase{0.5, true}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_large_tests,
+  Tests_PageRank_Rmat,
+  ::testing::Combine(
+    // disable correctness checks for large graphs
+    ::testing::Values(PageRank_Usecase{0.0, false, false},
+                      PageRank_Usecase{0.5, false, false},
+                      PageRank_Usecase{0.0, true, false},
+                      PageRank_Usecase{0.5, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/experimental/sssp_test.cpp b/cpp/tests/experimental/sssp_test.cpp
index 49eaca56f56..07947a7a059 100644
--- a/cpp/tests/experimental/sssp_test.cpp
+++ b/cpp/tests/experimental/sssp_test.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,12 +14,16 @@
* limitations under the License.
*/
+#include
#include
+#include
#include
+#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
#include
#include
@@ -28,31 +32,38 @@
#include
+#include
#include
#include
#include
#include
#include
+// do the perf measurements
+// enabled by command line parameter '--perf'
+//
+static int PERF = 0;
+
// Dijkstra's algorithm
template
-void sssp_reference(edge_t* offsets,
-                    vertex_t* indices,
-                    weight_t* weights,
+void sssp_reference(edge_t const* offsets,
+                    vertex_t const* indices,
+                    weight_t const* weights,
                    weight_t* distances,
                    vertex_t* predecessors,
                    vertex_t num_vertices,
                    vertex_t source,
                    weight_t cutoff = std::numeric_limits::max())
{
-  using queue_iterm_t = std::tuple;
+  using queue_item_t = std::tuple;
  std::fill(distances, distances + num_vertices, std::numeric_limits::max());
-  std::fill(predecessors, predecessors + num_vertices, cugraph::invalid_vertex_id::value);
+  std::fill(predecessors,
+            predecessors + num_vertices,
+            cugraph::experimental::invalid_vertex_id::value);
  *(distances + source) = weight_t{0.0};
-  std::priority_queue, std::greater>
-    queue{};
+  std::priority_queue, std::greater> queue{};
  queue.push(std::make_tuple(weight_t{0.0}, source));
  while (queue.size() > 0) {
@@ -78,21 +89,13 @@ void sssp_reference(edge_t* offsets,
  return;
}
-typedef struct SSSP_Usecase_t {
-  std::string graph_file_full_path{};
-  size_t source{false};
-
-  SSSP_Usecase_t(std::string const& graph_file_path, size_t source) : source(source)
-  {
-    if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
-      graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path;
-    } else {
-      graph_file_full_path = graph_file_path;
-    }
-  };
-} SSSP_Usecase;
+struct SSSP_Usecase {
+  size_t source{0};
+  bool check_correctness{true};
+};
-class Tests_SSSP : public ::testing::TestWithParam {
+template
+class Tests_SSSP : public ::testing::TestWithParam> {
 public:
  Tests_SSSP() {}
  static void
SetupTestCase() {} @@ -102,122 +105,230 @@ class Tests_SSSP : public ::testing::TestWithParam { virtual void TearDown() {} template - void run_current_test(SSSP_Usecase const& configuration) + void run_current_test(SSSP_Usecase const& sssp_usecase, input_usecase_t const& input_usecase) { + constexpr bool renumber = true; + raft::handle_t handle{}; + HighResClock hr_clock{}; + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } + + auto [graph, d_renumber_map_labels] = + input_usecase.template construct_graph( + handle, true, renumber); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "construct_graph took " << elapsed_time * 1e-6 << " s.\n"; + } - auto graph = - cugraph::test::read_graph_from_matrix_market_file( - handle, configuration.graph_file_full_path, true); auto graph_view = graph.view(); - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - std::vector h_weights(graph_view.get_number_of_edges()); - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - raft::update_host(h_weights.data(), - graph_view.weights(), - graph_view.get_number_of_edges(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - ASSERT_TRUE(configuration.source >= 0 && - configuration.source <= graph_view.get_number_of_vertices()) - << "Starting sources should be >= 0 and" - << " less than the number of vertices in the graph."; - - std::vector h_reference_distances(graph_view.get_number_of_vertices()); - std::vector h_reference_predecessors(graph_view.get_number_of_vertices()); - - sssp_reference(h_offsets.data(), - h_indices.data(), - h_weights.data(), - h_reference_distances.data(), - h_reference_predecessors.data(), - graph_view.get_number_of_vertices(), - static_cast(configuration.source)); + ASSERT_TRUE(static_cast(sssp_usecase.source) >= 0 && + static_cast(sssp_usecase.source) < graph_view.get_number_of_vertices()); rmm::device_uvector d_distances(graph_view.get_number_of_vertices(), handle.get_stream()); rmm::device_uvector d_predecessors(graph_view.get_number_of_vertices(), handle.get_stream()); - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } cugraph::experimental::sssp(handle, graph_view, - d_distances.begin(), - d_predecessors.begin(), - static_cast(configuration.source), + d_distances.data(), + d_predecessors.data(), + static_cast(sssp_usecase.source), std::numeric_limits::max(), false); - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - - std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); - std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); - - raft::update_host( - h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); - raft::update_host(h_cugraph_predecessors.data(), - d_predecessors.data(), - d_predecessors.size(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - auto max_weight_element = std::max_element(h_weights.begin(), 
h_weights.end()); - auto epsilon = *max_weight_element * weight_t{1e-6}; - auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; - - ASSERT_TRUE(std::equal(h_reference_distances.begin(), - h_reference_distances.end(), - h_cugraph_distances.begin(), - nearly_equal)) - << "distances do not match with the reference values."; - - for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { - auto i = std::distance(h_cugraph_predecessors.begin(), it); - if (*it == cugraph::invalid_vertex_id::value) { - ASSERT_TRUE(h_reference_predecessors[i] == *it) - << "vertex reachability do not match with the reference."; + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "SSSP took " << elapsed_time * 1e-6 << " s.\n"; + } + + if (sssp_usecase.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore) = + input_usecase.template construct_graph( + handle, true, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + std::vector h_weights(unrenumbered_graph_view.get_number_of_edges()); + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.get_matrix_partition_view().get_offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.get_matrix_partition_view().get_indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + raft::update_host(h_weights.data(), + *(unrenumbered_graph_view.get_matrix_partition_view().get_weights()), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto unrenumbered_source = static_cast(sssp_usecase.source); + if (renumber) { + std::vector h_renumber_map_labels((*d_renumber_map_labels).size()); + raft::update_host(h_renumber_map_labels.data(), + (*d_renumber_map_labels).data(), + (*d_renumber_map_labels).size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + unrenumbered_source = h_renumber_map_labels[sssp_usecase.source]; + } + + std::vector h_reference_distances(unrenumbered_graph_view.get_number_of_vertices()); + std::vector h_reference_predecessors( + unrenumbered_graph_view.get_number_of_vertices()); + + sssp_reference(h_offsets.data(), + h_indices.data(), + h_weights.data(), + h_reference_distances.data(), + h_reference_predecessors.data(), + unrenumbered_graph_view.get_number_of_vertices(), + unrenumbered_source, + std::numeric_limits::max()); + + std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); + std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); + if (renumber) { + cugraph::experimental::unrenumber_local_int_vertices(handle, + d_predecessors.data(), + d_predecessors.size(), + (*d_renumber_map_labels).data(), + vertex_t{0}, + graph_view.get_number_of_vertices(), + true); + + rmm::device_uvector d_unrenumbered_distances(size_t{0}, handle.get_stream()); + std::tie(std::ignore, d_unrenumbered_distances) = + cugraph::test::sort_by_key(handle, + (*d_renumber_map_labels).data(), + d_distances.data(), + (*d_renumber_map_labels).size()); + 
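// (sort_by_key pairs each result value with the renumber map entry as its key and sorts by original vertex ID, so element i of the output belongs to unrenumbered vertex i; the same idiom is applied to the predecessors below.) +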
rmm::device_uvector d_unrenumbered_predecessors(size_t{0}, handle.get_stream()); + std::tie(std::ignore, d_unrenumbered_predecessors) = + cugraph::test::sort_by_key(handle, + (*d_renumber_map_labels).data(), + d_predecessors.data(), + (*d_renumber_map_labels).size()); + + raft::update_host(h_cugraph_distances.data(), + d_unrenumbered_distances.data(), + d_unrenumbered_distances.size(), + handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_unrenumbered_predecessors.data(), + d_unrenumbered_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); } else { - auto pred_distance = h_reference_distances[*it]; - bool found{false}; - for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { - if (h_indices[j] == i) { - if (nearly_equal(pred_distance + h_weights[j], h_reference_distances[i])) { - found = true; - break; + raft::update_host( + h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_predecessors.data(), + d_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + } + + auto max_weight_element = std::max_element(h_weights.begin(), h_weights.end()); + auto epsilon = *max_weight_element * weight_t{1e-6}; + auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; + + ASSERT_TRUE(std::equal(h_reference_distances.begin(), + h_reference_distances.end(), + h_cugraph_distances.begin(), + nearly_equal)) + << "distances do not match with the reference values."; + + for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { + auto i = std::distance(h_cugraph_predecessors.begin(), it); + if (*it == cugraph::experimental::invalid_vertex_id::value) { + ASSERT_TRUE(h_reference_predecessors[i] == *it) + << "vertex reachability do not match with the reference."; + } else { + auto pred_distance = h_reference_distances[*it]; + bool found{false}; + for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { + if (h_indices[j] == i) { + if (nearly_equal(pred_distance + h_weights[j], h_reference_distances[i])) { + found = true; + break; + } } } + ASSERT_TRUE(found) + << "no edge from the predecessor vertex to this vertex with the matching weight."; } - ASSERT_TRUE(found) - << "no edge from the predecessor vertex to this vertex with the matching weight."; } } } }; +using Tests_SSSP_File = Tests_SSSP; +using Tests_SSSP_Rmat = Tests_SSSP; + // FIXME: add tests for type combinations -TEST_P(Tests_SSSP, CheckInt32Int32Float) { run_current_test(GetParam()); } - -#if 0 -INSTANTIATE_TEST_CASE_P(simple_test, - Tests_SSSP, - ::testing::Values(SSSP_Usecase("test/datasets/karate.mtx", 0))); -#else -INSTANTIATE_TEST_CASE_P(simple_test, - Tests_SSSP, - ::testing::Values(SSSP_Usecase("test/datasets/karate.mtx", 0), - SSSP_Usecase("test/datasets/dblp.mtx", 0), - SSSP_Usecase("test/datasets/wiki2003.mtx", 1000))); -#endif +TEST_P(Tests_SSSP_File, CheckInt32Int32Float) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} +TEST_P(Tests_SSSP_Rmat, CheckInt32Int32Float) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_SSSP_File, + // enable correctness checks + ::testing::Values( + std::make_tuple(SSSP_Usecase{0}, cugraph::test::File_Usecase("test/datasets/karate.mtx")), + std::make_tuple(SSSP_Usecase{0}, 
cugraph::test::File_Usecase("test/datasets/dblp.mtx")),
+    std::make_tuple(SSSP_Usecase{1000},
+                    cugraph::test::File_Usecase("test/datasets/wiki2003.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_SSSP_Rmat,
+  // enable correctness checks
+  ::testing::Values(std::make_tuple(
+    SSSP_Usecase{0}, cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_large_test,
+  Tests_SSSP_Rmat,
+  // disable correctness checks for large graphs
+  ::testing::Values(
+    std::make_tuple(SSSP_Usecase{0, false},
+                    cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/experimental/streams.cu b/cpp/tests/experimental/streams.cu
new file mode 100644
index 00000000000..c89ffe1e532
--- /dev/null
+++ b/cpp/tests/experimental/streams.cu
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "gtest/gtest.h"
+struct StreamTest : public ::testing::Test {
+};
+TEST_F(StreamTest, basic_test)
+{
+  int n_streams = 4;
+  raft::handle_t handle(n_streams);
+
+  const size_t input_size = 4096;
+
+#pragma omp parallel for
+  for (int i = 0; i < n_streams; i++) {
+    rmm::device_uvector u(input_size, handle.get_internal_stream_view(i)),
+      v(input_size, handle.get_internal_stream_view(i));
+    thrust::transform(rmm::exec_policy(handle.get_internal_stream_view(i)),
+                      u.begin(),
+                      u.end(),
+                      v.begin(),
+                      v.begin(),
+                      2 * thrust::placeholders::_1 + thrust::placeholders::_2);
+  }
+}
\ No newline at end of file
diff --git a/cpp/tests/experimental/weight_sum_test.cpp b/cpp/tests/experimental/weight_sum_test.cpp
new file mode 100644
index 00000000000..70c42da6136
--- /dev/null
+++ b/cpp/tests/experimental/weight_sum_test.cpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +template +void weight_sum_reference(edge_t const* offsets, + vertex_t const* indices, + weight_t const* weights, + weight_t* weight_sums, + vertex_t num_vertices, + bool major) +{ + if (!major) { std::fill(weight_sums, weight_sums + num_vertices, weight_t{0.0}); } + for (vertex_t i = 0; i < num_vertices; ++i) { + if (major) { + weight_sums[i] = + std::accumulate(weights + offsets[i], weights + offsets[i + 1], weight_t{0.0}); + } else { + for (auto j = offsets[i]; j < offsets[i + 1]; ++j) { + auto nbr = indices[j]; + weight_sums[nbr] += weights[j]; + } + } + } + + return; +} + +typedef struct WeightSum_Usecase_t { + std::string graph_file_full_path{}; + + WeightSum_Usecase_t(std::string const& graph_file_path) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +} WeightSum_Usecase; + +class Tests_WeightSum : public ::testing::TestWithParam { + public: + Tests_WeightSum() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(WeightSum_Usecase const& configuration) + { + raft::handle_t handle{}; + + cugraph::experimental::graph_t graph( + handle); + std::tie(graph, std::ignore) = cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.graph_file_full_path, true, false); + auto graph_view = graph.view(); + + std::vector h_offsets(graph_view.get_number_of_vertices() + 1); + std::vector h_indices(graph_view.get_number_of_edges()); + std::vector h_weights(graph_view.get_number_of_edges()); + raft::update_host(h_offsets.data(), + graph_view.get_matrix_partition_view().get_offsets(), + graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + graph_view.get_matrix_partition_view().get_indices(), + graph_view.get_number_of_edges(), + handle.get_stream()); + raft::update_host(h_weights.data(), + *(graph_view.get_matrix_partition_view().get_weights()), + graph_view.get_number_of_edges(), + handle.get_stream()); + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + + std::vector h_reference_in_weight_sums(graph_view.get_number_of_vertices()); + std::vector h_reference_out_weight_sums(graph_view.get_number_of_vertices()); + + weight_sum_reference(h_offsets.data(), + h_indices.data(), + h_weights.data(), + h_reference_in_weight_sums.data(), + graph_view.get_number_of_vertices(), + store_transposed); + + weight_sum_reference(h_offsets.data(), + h_indices.data(), + h_weights.data(), + h_reference_out_weight_sums.data(), + graph_view.get_number_of_vertices(), + !store_transposed); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + auto d_in_weight_sums = graph_view.compute_in_weight_sums(handle); + auto d_out_weight_sums = graph_view.compute_out_weight_sums(handle); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + std::vector h_cugraph_in_weight_sums(graph_view.get_number_of_vertices()); + std::vector h_cugraph_out_weight_sums(graph_view.get_number_of_vertices()); + + raft::update_host(h_cugraph_in_weight_sums.data(), + d_in_weight_sums.data(), + d_in_weight_sums.size(), + handle.get_stream()); + 
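// (raft::update_host enqueues asynchronous device-to-host copies on the handle's stream; the cudaStreamSynchronize below makes both host vectors safe to read.) +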
raft::update_host(h_cugraph_out_weight_sums.data(),
+                      d_out_weight_sums.data(),
+                      d_out_weight_sums.size(),
+                      handle.get_stream());
+    CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
+
+    auto threshold_ratio     = weight_t{1e-4};
+    auto threshold_magnitude = std::numeric_limits<weight_t>::min();
+    auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
+      return std::abs(lhs - rhs) <
+             std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
+    };
+
+    ASSERT_TRUE(std::equal(h_reference_in_weight_sums.begin(),
+                           h_reference_in_weight_sums.end(),
+                           h_cugraph_in_weight_sums.begin(),
+                           nearly_equal))
+      << "In-weight-sum values do not match with the reference values.";
+    ASSERT_TRUE(std::equal(h_reference_out_weight_sums.begin(),
+                           h_reference_out_weight_sums.end(),
+                           h_cugraph_out_weight_sums.begin(),
+                           nearly_equal))
+      << "Out-weight-sum values do not match with the reference values.";
+  }
+};
+
+// FIXME: add tests for type combinations
+
+TEST_P(Tests_WeightSum, CheckInt32Int32FloatTransposed)
+{
+  run_current_test<int32_t, int32_t, float, true>(GetParam());
+}
+
+TEST_P(Tests_WeightSum, CheckInt32Int32FloatUntransposed)
+{
+  run_current_test<int32_t, int32_t, float, false>(GetParam());
+}
+
+INSTANTIATE_TEST_SUITE_P(simple_test,
+                         Tests_WeightSum,
+                         ::testing::Values(WeightSum_Usecase("test/datasets/karate.mtx"),
+                                           WeightSum_Usecase("test/datasets/web-Google.mtx"),
+                                           WeightSum_Usecase("test/datasets/ljournal-2008.mtx"),
+                                           WeightSum_Usecase("test/datasets/webbase-1M.mtx")));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/generators/erdos_renyi_test.cpp b/cpp/tests/generators/erdos_renyi_test.cpp
new file mode 100644
index 00000000000..3606ce2ddef
--- /dev/null
+++ b/cpp/tests/generators/erdos_renyi_test.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
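Stepping outside the diff for a moment: the weight-sum assertions just above accept a relative error of 1e-4 but floor the bound with a tiny absolute threshold, so sums at or near zero cannot fail a purely relative test. Restated standalone (an editorial sketch under the same assumption the test makes, namely non-negative values; not code from the PR):

```cpp
// Combined relative/absolute floating-point comparison, as used by the
// weight-sum (and, later in this PR, the MG PageRank) tests.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <limits>

template <typename T>
bool nearly_equal(T lhs, T rhs)
{
  T threshold_ratio{1e-4};
  T threshold_magnitude = std::numeric_limits<T>::min();  // absolute floor
  return std::abs(lhs - rhs) <
         std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
}

int main()
{
  assert(nearly_equal(1000.0f, 1000.05f));  // relative error 5e-5 < 1e-4
  assert(!nearly_equal(1.0f, 1.001f));      // relative error 1e-3 > 1e-4
  assert(nearly_equal(0.0f, 0.0f));         // only the absolute floor saves this case
  return 0;
}
```

For signed data the scale would be std::max(std::abs(lhs), std::abs(rhs)) rather than std::max(lhs, rhs); the tests can use the simpler form because weight sums and PageRank values are non-negative.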
+ */ + +#include +#include + +#include +#include + +#include + +#include + +struct GenerateErdosRenyiTest : public ::testing::Test { +}; + +template +void test_symmetric(std::vector& h_src_v, std::vector& h_dst_v) +{ + std::vector reverse_src_v(h_src_v.size()); + std::vector reverse_dst_v(h_dst_v.size()); + + std::copy(h_src_v.begin(), h_src_v.end(), reverse_dst_v.begin()); + std::copy(h_dst_v.begin(), h_dst_v.end(), reverse_src_v.begin()); + + thrust::sort(thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(h_src_v.begin(), h_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(h_src_v.end(), h_dst_v.end()))); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(reverse_src_v.begin(), reverse_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(reverse_src_v.end(), reverse_dst_v.end()))); + + EXPECT_EQ(reverse_src_v, h_src_v); + EXPECT_EQ(reverse_dst_v, h_dst_v); +} + +template +void er_test(size_t num_vertices, float p) +{ + raft::handle_t handle; + rmm::device_uvector d_src_v(0, handle.get_stream()); + rmm::device_uvector d_dst_v(0, handle.get_stream()); + + std::tie(d_src_v, d_dst_v) = + cugraph::generate_erdos_renyi_graph_edgelist_gnp(handle, num_vertices, p, 0); + + handle.get_stream_view().synchronize(); + + std::vector h_src_v(d_src_v.size()); + std::vector h_dst_v(d_dst_v.size()); + + raft::update_host(h_src_v.data(), d_src_v.data(), d_src_v.size(), handle.get_stream()); + raft::update_host(h_dst_v.data(), d_dst_v.data(), d_dst_v.size(), handle.get_stream()); + + handle.get_stream_view().synchronize(); + + float expected_edge_count = p * num_vertices * num_vertices; + + ASSERT_GE(h_src_v.size(), static_cast(expected_edge_count * 0.8)); + ASSERT_LE(h_src_v.size(), static_cast(expected_edge_count * 1.2)); + ASSERT_EQ(std::count_if(h_src_v.begin(), + h_src_v.end(), + [n = static_cast(num_vertices)](auto v) { + return !cugraph::experimental::is_valid_vertex(n, v); + }), + 0); + ASSERT_EQ(std::count_if(h_dst_v.begin(), + h_dst_v.end(), + [n = static_cast(num_vertices)](auto v) { + return !cugraph::experimental::is_valid_vertex(n, v); + }), + 0); +} + +TEST_F(GenerateErdosRenyiTest, ERTest) +{ + er_test(size_t{10}, float{0.1}); + er_test(size_t{20}, float{0.1}); + er_test(size_t{50}, float{0.1}); + er_test(size_t{10000}, float{0.1}); +} + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/generators/generators_test.cpp b/cpp/tests/generators/generators_test.cpp new file mode 100644 index 00000000000..11e63d81f36 --- /dev/null +++ b/cpp/tests/generators/generators_test.cpp @@ -0,0 +1,689 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
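A quick editorial sanity check on the ±20% edge-count band in er_test above: each of the n² ordered vertex pairs is included independently with probability p, so the edge count is binomial with mean mu = p·n² and standard deviation sqrt(mu·(1 - p)). The band is loose in statistical terms at small n (the fixed seed passed to the generator keeps those runs deterministic anyway) and extremely tight at large n, where a violation would indicate a real bug rather than noise. A small program making that concrete (ours, not from the PR):

```cpp
// How wide is +/-20% of the mean, in standard deviations, at the er_test sizes?
#include <cmath>
#include <cstdio>

int main()
{
  double p = 0.1;
  for (double n : {10.0, 20.0, 50.0, 10000.0}) {
    double mu = p * n * n;                // expected edge count
    double sd = std::sqrt(mu * (1 - p));  // binomial standard deviation
    std::printf("n=%5.0f  mu=%10.0f  band=%9.0f edges (%6.1f sd)\n",
                n, mu, 0.2 * mu, 0.2 * mu / sd);
  }
  return 0;  // n=10: band is ~0.7 sd; n=10000: ~667 sd
}
```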
+ */ + +#include + +#include +#include + +#include +#include +#include + +#include + +struct GeneratorsTest : public ::testing::Test { +}; + +TEST_F(GeneratorsTest, PathGraphTest) +{ + using vertex_t = int32_t; + + std::vector expected_src_v({0, 1, 2, 3}); + std::vector expected_dst_v({1, 2, 3, 4}); + std::vector actual_src_v; + std::vector actual_dst_v; + + std::vector> parameters({{5, 0}}); + + raft::handle_t handle; + + rmm::device_uvector src_v(0, handle.get_stream()); + rmm::device_uvector dst_v(0, handle.get_stream()); + + std::tie(src_v, dst_v) = cugraph::generate_path_graph_edgelist(handle, parameters); + + actual_src_v.resize(src_v.size()); + actual_dst_v.resize(src_v.size()); + + raft::update_host(actual_src_v.data(), src_v.data(), src_v.size(), handle.get_stream()); + raft::update_host(actual_dst_v.data(), dst_v.data(), dst_v.size(), handle.get_stream()); + + EXPECT_EQ(expected_src_v, actual_src_v); + EXPECT_EQ(expected_dst_v, actual_dst_v); +} + +TEST_F(GeneratorsTest, Mesh2DGraphTest) +{ + using vertex_t = int32_t; + + std::vector expected_src_v({0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, + 20, 21, 22, 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19}); + std::vector expected_dst_v({1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, 17, 18, 19, + 21, 22, 23, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23}); + std::vector actual_src_v; + std::vector actual_dst_v; + + raft::handle_t handle; + + rmm::device_uvector src_v(0, handle.get_stream()); + rmm::device_uvector dst_v(0, handle.get_stream()); + + std::vector> parameters( + {{4, 2, 0}, {4, 2, 8}, {4, 2, 16}}); + + std::tie(src_v, dst_v) = cugraph::generate_2d_mesh_graph_edgelist(handle, parameters); + + actual_src_v.resize(src_v.size()); + actual_dst_v.resize(src_v.size()); + + raft::update_host(actual_src_v.data(), src_v.data(), src_v.size(), handle.get_stream()); + raft::update_host(actual_dst_v.data(), dst_v.data(), dst_v.size(), handle.get_stream()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())) + + expected_src_v.size()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())) + + actual_src_v.size()); + + EXPECT_EQ(expected_src_v, actual_src_v); + EXPECT_EQ(expected_dst_v, actual_dst_v); +} + +TEST_F(GeneratorsTest, Mesh3DGraphTest) +{ + using vertex_t = int32_t; + + std::vector expected_src_v( + {0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, 16, 18, 19, 21, 22, 24, 25, 27, 28, 30, 31, 33, 34, + 36, 37, 39, 40, 42, 43, 45, 46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 64, 66, 67, 69, 70, + 72, 73, 75, 76, 78, 79, 0, 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 14, 18, 19, 20, 21, 22, 23, + 27, 28, 29, 30, 31, 32, 36, 37, 38, 39, 40, 41, 45, 46, 47, 48, 49, 50, 54, 55, 56, 57, 58, 59, + 63, 64, 65, 66, 67, 68, 72, 73, 74, 75, 76, 77, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71}); + + std::vector expected_dst_v( + {1, 2, 4, 5, 7, 8, 10, 11, 13, 14, 16, 17, 19, 20, 22, 23, 25, 26, 28, 29, 31, 32, 34, 35, + 37, 38, 40, 41, 43, 44, 46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 64, 65, 67, 68, 70, 71, + 73, 74, 76, 77, 79, 80, 3, 4, 5, 6, 7, 8, 12, 13, 
14, 15, 16, 17, 21, 22, 23, 24, 25, 26, + 30, 31, 32, 33, 34, 35, 39, 40, 41, 42, 43, 44, 48, 49, 50, 51, 52, 53, 57, 58, 59, 60, 61, 62, + 66, 67, 68, 69, 70, 71, 75, 76, 77, 78, 79, 80, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, + 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80}); + + std::vector actual_src_v; + std::vector actual_dst_v; + + raft::handle_t handle; + + rmm::device_uvector src_v(0, handle.get_stream()); + rmm::device_uvector dst_v(0, handle.get_stream()); + + std::vector> parameters( + {{3, 3, 3, 0}, {3, 3, 3, 27}, {3, 3, 3, 54}}); + + std::tie(src_v, dst_v) = cugraph::generate_3d_mesh_graph_edgelist(handle, parameters); + + actual_src_v.resize(src_v.size()); + actual_dst_v.resize(src_v.size()); + + raft::update_host(actual_src_v.data(), src_v.data(), src_v.size(), handle.get_stream()); + raft::update_host(actual_dst_v.data(), dst_v.data(), dst_v.size(), handle.get_stream()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())) + + expected_src_v.size()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())) + + actual_src_v.size()); + + EXPECT_EQ(expected_src_v, actual_src_v); + EXPECT_EQ(expected_dst_v, actual_dst_v); +} + +TEST_F(GeneratorsTest, CompleteGraphTestTriangles) +{ + using vertex_t = int32_t; + + std::vector expected_src_v({0, 0, 1, 3, 3, 4, 6, 6, 7}); + std::vector expected_dst_v({1, 2, 2, 4, 5, 5, 7, 8, 8}); + std::vector actual_src_v; + std::vector actual_dst_v; + + raft::handle_t handle; + + rmm::device_uvector src_v(0, handle.get_stream()); + rmm::device_uvector dst_v(0, handle.get_stream()); + + std::vector> parameters({{3, 0}, {3, 3}, {3, 6}}); + + std::tie(src_v, dst_v) = cugraph::generate_complete_graph_edgelist(handle, parameters); + + actual_src_v.resize(src_v.size()); + actual_dst_v.resize(src_v.size()); + + raft::update_host(actual_src_v.data(), src_v.data(), src_v.size(), handle.get_stream()); + raft::update_host(actual_dst_v.data(), dst_v.data(), dst_v.size(), handle.get_stream()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())) + + expected_src_v.size()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())) + + actual_src_v.size()); + + EXPECT_EQ(expected_src_v, actual_src_v); + EXPECT_EQ(expected_dst_v, actual_dst_v); +} + +TEST_F(GeneratorsTest, CompleteGraphTest5) +{ + using vertex_t = int32_t; + + size_t num_vertices{5}; + size_t num_graphs{3}; + + std::vector expected_src_v({0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 5, 5, 5, 5, 6, + 6, 6, 7, 7, 8, 10, 10, 10, 10, 11, 11, 11, 12, 12, 13}); + std::vector expected_dst_v({1, 2, 3, 4, 2, 3, 4, 3, 4, 4, 6, 7, 8, 9, 7, + 8, 9, 8, 9, 9, 11, 12, 13, 14, 12, 13, 14, 13, 14, 14}); + std::vector actual_src_v; + std::vector actual_dst_v; + + raft::handle_t handle; + + rmm::device_uvector src_v(0, handle.get_stream()); + 
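+  // [editorial aside, not part of this PR] The pair-wise sorts used throughout
+  // these generator tests rely on thrust's zip iterator: zipping the two
+  // parallel vectors makes thrust::sort order them as (src, dst) tuples,
+  // lexicographically, which normalizes the unspecified generator output order
+  // before comparison. In isolation, with the usual thrust headers:
+  //   std::vector<int> src{3, 1, 3, 0}, dst{0, 2, 1, 5};
+  //   auto first = thrust::make_zip_iterator(thrust::make_tuple(src.begin(), dst.begin()));
+  //   thrust::sort(thrust::host, first, first + src.size());
+  //   // src == {0, 1, 3, 3}, dst == {5, 2, 0, 1}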
rmm::device_uvector dst_v(0, handle.get_stream()); + + std::vector> parameters({{5, 0}, {5, 5}, {5, 10}}); + + std::tie(src_v, dst_v) = cugraph::generate_complete_graph_edgelist(handle, parameters); + + actual_src_v.resize(src_v.size()); + actual_dst_v.resize(src_v.size()); + + raft::update_host(actual_src_v.data(), src_v.data(), src_v.size(), handle.get_stream()); + raft::update_host(actual_dst_v.data(), dst_v.data(), dst_v.size(), handle.get_stream()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())) + + expected_src_v.size()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())) + + actual_src_v.size()); + + EXPECT_EQ(expected_src_v, actual_src_v); + EXPECT_EQ(expected_dst_v, actual_dst_v); +} + +TEST_F(GeneratorsTest, LineGraphTestSymmetric) +{ + using vertex_t = int32_t; + + size_t num_vertices{5}; + std::vector expected_src_v({0, 1, 2, 3, 1, 2, 3, 4}); + std::vector expected_dst_v({1, 2, 3, 4, 0, 1, 2, 3}); + std::vector actual_src_v; + std::vector actual_dst_v; + + raft::handle_t handle; + + rmm::device_uvector src_v(0, handle.get_stream()); + rmm::device_uvector dst_v(0, handle.get_stream()); + + std::vector> parameters({{5, 0}}); + + std::tie(src_v, dst_v) = cugraph::generate_path_graph_edgelist(handle, parameters); + std::tie(src_v, dst_v, std::ignore) = cugraph::symmetrize_edgelist( + handle, std::move(src_v), std::move(dst_v), std::nullopt); + + actual_src_v.resize(src_v.size()); + actual_dst_v.resize(src_v.size()); + + raft::update_host(actual_src_v.data(), src_v.data(), src_v.size(), handle.get_stream()); + raft::update_host(actual_dst_v.data(), dst_v.data(), dst_v.size(), handle.get_stream()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())) + + expected_src_v.size()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())) + + actual_src_v.size()); + + EXPECT_EQ(expected_src_v, actual_src_v); + EXPECT_EQ(expected_dst_v, actual_dst_v); +} + +TEST_F(GeneratorsTest, Mesh2DGraphTestSymmetric) +{ + using vertex_t = int32_t; + + size_t x{4}; + size_t y{2}; + size_t num_graphs{3}; + + std::vector expected_src_v({0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, + 20, 21, 22, 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, + 1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, 17, 18, 19, + 21, 22, 23, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23}); + std::vector expected_dst_v({1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, 17, 18, 19, + 21, 22, 23, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, + 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, + 20, 21, 22, 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19}); + std::vector actual_src_v; + std::vector actual_dst_v; + + raft::handle_t handle; + + rmm::device_uvector src_v(0, handle.get_stream()); + rmm::device_uvector dst_v(0, handle.get_stream()); + + std::vector> parameters( + {{4, 2, 0}, {4, 2, 8}, {4, 2, 16}}); + + std::tie(src_v, dst_v) = cugraph::generate_2d_mesh_graph_edgelist(handle, 
parameters); + std::tie(src_v, dst_v, std::ignore) = cugraph::symmetrize_edgelist( + handle, std::move(src_v), std::move(dst_v), std::nullopt); + + actual_src_v.resize(src_v.size()); + actual_dst_v.resize(src_v.size()); + + raft::update_host(actual_src_v.data(), src_v.data(), src_v.size(), handle.get_stream()); + raft::update_host(actual_dst_v.data(), dst_v.data(), dst_v.size(), handle.get_stream()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())) + + expected_src_v.size()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())) + + actual_src_v.size()); + + EXPECT_EQ(expected_src_v, actual_src_v); + EXPECT_EQ(expected_dst_v, actual_dst_v); +} + +TEST_F(GeneratorsTest, Mesh3DGraphTestSymmetric) +{ + using vertex_t = int32_t; + + size_t x{3}; + size_t y{3}; + size_t z{3}; + size_t num_graphs{3}; + + std::vector expected_src_v( + {0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, 16, 18, 19, 21, 22, 24, 25, 27, 28, 30, 31, 33, 34, + 36, 37, 39, 40, 42, 43, 45, 46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 64, 66, 67, 69, 70, + 72, 73, 75, 76, 78, 79, 0, 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 14, 18, 19, 20, 21, 22, 23, + 27, 28, 29, 30, 31, 32, 36, 37, 38, 39, 40, 41, 45, 46, 47, 48, 49, 50, 54, 55, 56, 57, 58, 59, + 63, 64, 65, 66, 67, 68, 72, 73, 74, 75, 76, 77, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 1, 2, 4, 5, 7, 8, + 10, 11, 13, 14, 16, 17, 19, 20, 22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43, 44, + 46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 64, 65, 67, 68, 70, 71, 73, 74, 76, 77, 79, 80, + 3, 4, 5, 6, 7, 8, 12, 13, 14, 15, 16, 17, 21, 22, 23, 24, 25, 26, 30, 31, 32, 33, 34, 35, + 39, 40, 41, 42, 43, 44, 48, 49, 50, 51, 52, 53, 57, 58, 59, 60, 61, 62, 66, 67, 68, 69, 70, 71, + 75, 76, 77, 78, 79, 80, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 63, 64, 65, 66, 67, 68, + 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80}); + + std::vector expected_dst_v( + {1, 2, 4, 5, 7, 8, 10, 11, 13, 14, 16, 17, 19, 20, 22, 23, 25, 26, 28, 29, 31, 32, 34, 35, + 37, 38, 40, 41, 43, 44, 46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 64, 65, 67, 68, 70, 71, + 73, 74, 76, 77, 79, 80, 3, 4, 5, 6, 7, 8, 12, 13, 14, 15, 16, 17, 21, 22, 23, 24, 25, 26, + 30, 31, 32, 33, 34, 35, 39, 40, 41, 42, 43, 44, 48, 49, 50, 51, 52, 53, 57, 58, 59, 60, 61, 62, + 66, 67, 68, 69, 70, 71, 75, 76, 77, 78, 79, 80, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, + 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 0, 1, 3, 4, 6, 7, + 9, 10, 12, 13, 15, 16, 18, 19, 21, 22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43, + 45, 46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 64, 66, 67, 69, 70, 72, 73, 75, 76, 78, 79, + 0, 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 14, 18, 19, 20, 21, 22, 23, 27, 28, 29, 30, 31, 32, + 36, 37, 38, 39, 40, 41, 45, 46, 47, 48, 49, 50, 54, 55, 56, 57, 58, 59, 63, 64, 65, 66, 67, 68, 
+ 72, 73, 74, 75, 76, 77, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71}); + + std::vector actual_src_v; + std::vector actual_dst_v; + + raft::handle_t handle; + + rmm::device_uvector src_v(0, handle.get_stream()); + rmm::device_uvector dst_v(0, handle.get_stream()); + + std::vector> parameters( + {{3, 3, 3, 0}, {3, 3, 3, 27}, {3, 3, 3, 54}}); + + std::tie(src_v, dst_v) = cugraph::generate_3d_mesh_graph_edgelist(handle, parameters); + std::tie(src_v, dst_v, std::ignore) = cugraph::symmetrize_edgelist( + handle, std::move(src_v), std::move(dst_v), std::nullopt); + + actual_src_v.resize(src_v.size()); + actual_dst_v.resize(src_v.size()); + + raft::update_host(actual_src_v.data(), src_v.data(), src_v.size(), handle.get_stream()); + raft::update_host(actual_dst_v.data(), dst_v.data(), dst_v.size(), handle.get_stream()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())) + + expected_src_v.size()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())) + + actual_src_v.size()); + + EXPECT_EQ(expected_src_v, actual_src_v); + EXPECT_EQ(expected_dst_v, actual_dst_v); +} + +TEST_F(GeneratorsTest, CompleteGraphTestTrianglesSymmetric) +{ + using vertex_t = int32_t; + + size_t num_vertices{3}; + size_t num_graphs{3}; + + std::vector expected_src_v({0, 0, 1, 3, 3, 4, 6, 6, 7, 1, 2, 2, 4, 5, 5, 7, 8, 8}); + std::vector expected_dst_v({1, 2, 2, 4, 5, 5, 7, 8, 8, 0, 0, 1, 3, 3, 4, 6, 6, 7}); + std::vector actual_src_v; + std::vector actual_dst_v; + + raft::handle_t handle; + + rmm::device_uvector src_v(0, handle.get_stream()); + rmm::device_uvector dst_v(0, handle.get_stream()); + + std::vector> parameters({{3, 0}, {3, 3}, {3, 6}}); + + std::tie(src_v, dst_v) = cugraph::generate_complete_graph_edgelist(handle, parameters); + std::tie(src_v, dst_v, std::ignore) = cugraph::symmetrize_edgelist( + handle, std::move(src_v), std::move(dst_v), std::nullopt); + + actual_src_v.resize(src_v.size()); + actual_dst_v.resize(src_v.size()); + + raft::update_host(actual_src_v.data(), src_v.data(), src_v.size(), handle.get_stream()); + raft::update_host(actual_dst_v.data(), dst_v.data(), dst_v.size(), handle.get_stream()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())) + + expected_src_v.size()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())) + + actual_src_v.size()); + + EXPECT_EQ(expected_src_v, actual_src_v); + EXPECT_EQ(expected_dst_v, actual_dst_v); +} + +TEST_F(GeneratorsTest, CompleteGraphTest5Symmetric) +{ + using vertex_t = int32_t; + + size_t num_vertices{5}; + size_t num_graphs{3}; + + std::vector expected_src_v({0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 5, 5, 5, 5, 6, + 6, 6, 7, 7, 8, 10, 10, 10, 10, 11, 11, 11, 12, 12, 13, + 1, 2, 3, 4, 2, 3, 4, 3, 4, 4, 6, 7, 8, 9, 7, + 8, 9, 8, 9, 
9, 11, 12, 13, 14, 12, 13, 14, 13, 14, 14}); + std::vector expected_dst_v({1, 2, 3, 4, 2, 3, 4, 3, 4, 4, 6, 7, 8, 9, 7, + 8, 9, 8, 9, 9, 11, 12, 13, 14, 12, 13, 14, 13, 14, 14, + 0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 5, 5, 5, 5, 6, + 6, 6, 7, 7, 8, 10, 10, 10, 10, 11, 11, 11, 12, 12, 13}); + std::vector actual_src_v; + std::vector actual_dst_v; + + raft::handle_t handle; + + rmm::device_uvector src_v(0, handle.get_stream()); + rmm::device_uvector dst_v(0, handle.get_stream()); + + std::vector> parameters({{5, 0}, {5, 5}, {5, 10}}); + + std::tie(src_v, dst_v) = cugraph::generate_complete_graph_edgelist(handle, parameters); + std::tie(src_v, dst_v, std::ignore) = cugraph::symmetrize_edgelist( + handle, std::move(src_v), std::move(dst_v), std::nullopt); + + actual_src_v.resize(src_v.size()); + actual_dst_v.resize(src_v.size()); + + raft::update_host(actual_src_v.data(), src_v.data(), src_v.size(), handle.get_stream()); + raft::update_host(actual_dst_v.data(), dst_v.data(), dst_v.size(), handle.get_stream()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())) + + expected_src_v.size()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())) + + actual_src_v.size()); + + EXPECT_EQ(expected_src_v, actual_src_v); + EXPECT_EQ(expected_dst_v, actual_dst_v); +} + +TEST_F(GeneratorsTest, CombineGraphsTest) +{ + using vertex_t = int32_t; + using weight_t = float; + + raft::handle_t handle; + + size_t num_vertices{8}; + + std::vector expected_src_v({0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, + 20, 21, 22, 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19}); + std::vector expected_dst_v({1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 13, 14, 15, 17, 18, 19, + 21, 22, 23, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23}); + + rmm::device_uvector src_graph_1_v(0, handle.get_stream()); + rmm::device_uvector dst_graph_1_v(0, handle.get_stream()); + rmm::device_uvector src_graph_2_v(0, handle.get_stream()); + rmm::device_uvector dst_graph_2_v(0, handle.get_stream()); + rmm::device_uvector src_v(0, handle.get_stream()); + rmm::device_uvector dst_v(0, handle.get_stream()); + + std::vector> parameters1({{num_vertices, 0}}); + std::vector> parameters2( + {{4, 2, 0}, {4, 2, 8}, {4, 2, 16}}); + + std::tie(src_graph_1_v, dst_graph_1_v) = + cugraph::generate_path_graph_edgelist(handle, parameters1); + std::tie(src_graph_2_v, dst_graph_2_v) = + cugraph::generate_2d_mesh_graph_edgelist(handle, parameters2); + + std::vector> sources; + sources.push_back(std::move(src_graph_1_v)); + sources.push_back(std::move(src_graph_2_v)); + + std::vector> dests; + dests.push_back(std::move(dst_graph_1_v)); + dests.push_back(std::move(dst_graph_2_v)); + + std::tie(src_v, dst_v, std::ignore) = cugraph::combine_edgelists( + handle, std::move(sources), std::move(dests), std::nullopt); + + std::vector actual_src_v; + std::vector actual_dst_v; + + actual_src_v.resize(src_v.size()); + actual_dst_v.resize(dst_v.size()); + + raft::update_host(actual_src_v.data(), src_v.data(), src_v.size(), handle.get_stream()); + raft::update_host(actual_dst_v.data(), dst_v.data(), dst_v.size(), handle.get_stream()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())), + 
thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())) + + expected_src_v.size()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())) + + actual_src_v.size()); + + EXPECT_EQ(expected_src_v, actual_src_v); + EXPECT_EQ(expected_dst_v, actual_dst_v); +} + +TEST_F(GeneratorsTest, CombineGraphsOffsetsTest) +{ + using vertex_t = int32_t; + using weight_t = float; + + raft::handle_t handle; + + size_t num_vertices{8}; + vertex_t offset{10}; + + std::vector expected_src_v({0, 1, 2, 3, 4, 5, 6, 10, 11, 12, 14, 15, 16, + 18, 19, 20, 22, 23, 24, 26, 27, 28, 30, 31, 32, 10, + 11, 12, 13, 18, 19, 20, 21, 26, 27, 28, 29}); + std::vector expected_dst_v({1, 2, 3, 4, 5, 6, 7, 11, 12, 13, 15, 16, 17, + 19, 20, 21, 23, 24, 25, 27, 28, 29, 31, 32, 33, 14, + 15, 16, 17, 22, 23, 24, 25, 30, 31, 32, 33}); + + rmm::device_uvector src_graph_1_v(0, handle.get_stream()); + rmm::device_uvector dst_graph_1_v(0, handle.get_stream()); + rmm::device_uvector src_graph_2_v(0, handle.get_stream()); + rmm::device_uvector dst_graph_2_v(0, handle.get_stream()); + rmm::device_uvector src_v(0, handle.get_stream()); + rmm::device_uvector dst_v(0, handle.get_stream()); + + std::vector> parameters1({{num_vertices, 0}}); + std::vector> parameters2( + {{4, 2, 10}, {4, 2, 18}, {4, 2, 26}}); + + std::tie(src_graph_1_v, dst_graph_1_v) = + cugraph::generate_path_graph_edgelist(handle, parameters1); + std::tie(src_graph_2_v, dst_graph_2_v) = + cugraph::generate_2d_mesh_graph_edgelist(handle, parameters2); + + std::vector> sources; + sources.push_back(std::move(src_graph_1_v)); + sources.push_back(std::move(src_graph_2_v)); + + std::vector> dests; + dests.push_back(std::move(dst_graph_1_v)); + dests.push_back(std::move(dst_graph_2_v)); + + std::tie(src_v, dst_v, std::ignore) = cugraph::combine_edgelists( + handle, std::move(sources), std::move(dests), std::nullopt); + + std::vector actual_src_v; + std::vector actual_dst_v; + + actual_src_v.resize(src_v.size()); + actual_dst_v.resize(dst_v.size()); + + raft::update_host(actual_src_v.data(), src_v.data(), src_v.size(), handle.get_stream()); + raft::update_host(actual_dst_v.data(), dst_v.data(), dst_v.size(), handle.get_stream()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(expected_src_v.begin(), expected_dst_v.begin())) + + expected_src_v.size()); + + thrust::sort( + thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())), + thrust::make_zip_iterator(thrust::make_tuple(actual_src_v.begin(), actual_dst_v.begin())) + + actual_src_v.size()); + + EXPECT_EQ(expected_src_v, actual_src_v); + EXPECT_EQ(expected_dst_v, actual_dst_v); +} + +TEST_F(GeneratorsTest, ScrambleTest) +{ + using vertex_t = int32_t; + using edge_t = int32_t; + + edge_t num_vertices{30}; + edge_t num_edges{100}; + + raft::handle_t handle; + + std::vector input_src_v(num_edges); + std::vector input_dst_v(num_edges); + + std::default_random_engine generator{}; + std::uniform_int_distribution distribution{0, num_vertices - 1}; + + std::generate(input_src_v.begin(), input_src_v.end(), [&distribution, &generator]() { + return distribution(generator); + }); + std::generate(input_dst_v.begin(), input_dst_v.end(), [&distribution, &generator]() { + 
return distribution(generator); + }); + + rmm::device_uvector d_src_v(input_src_v.size(), handle.get_stream()); + rmm::device_uvector d_dst_v(input_src_v.size(), handle.get_stream()); + std::vector output_src_v(input_src_v.size()); + std::vector output_dst_v(input_src_v.size()); + + raft::update_device(d_src_v.data(), input_src_v.data(), input_src_v.size(), handle.get_stream()); + raft::update_device(d_dst_v.data(), input_dst_v.data(), input_dst_v.size(), handle.get_stream()); + + cugraph::scramble_vertex_ids(handle, d_src_v, d_dst_v, 5, 0); + + raft::update_host(output_src_v.data(), d_src_v.data(), d_src_v.size(), handle.get_stream()); + raft::update_host(output_dst_v.data(), d_dst_v.data(), d_dst_v.size(), handle.get_stream()); + + EXPECT_TRUE(cugraph::test::renumbered_vectors_same(handle, input_src_v, output_src_v)); + EXPECT_TRUE(cugraph::test::renumbered_vectors_same(handle, input_dst_v, output_dst_v)); +} + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/layout/force_atlas2_test.cu b/cpp/tests/layout/force_atlas2_test.cu index d564765d0df..e736f2d2db5 100644 --- a/cpp/tests/layout/force_atlas2_test.cu +++ b/cpp/tests/layout/force_atlas2_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. * * NVIDIA CORPORATION and its licensors retain all intellectual property * and proprietary rights in and to this software, related documentation @@ -17,8 +17,8 @@ #include #include -#include -#include +#include +#include #include #include @@ -111,9 +111,10 @@ class Tests_Force_Atlas2 : public ::testing::TestWithParam std::vector> adj_matrix(m, std::vector(m)); std::vector force_atlas2(m * 2); + raft::handle_t const handle; + auto stream = handle.get_stream(); // device alloc - rmm::device_vector force_atlas2_vector(m * 2); - float* d_force_atlas2 = force_atlas2_vector.data().get(); + rmm::device_uvector pos(m * 2, stream); // Read ASSERT_EQ((cugraph::test::mm_to_coo( @@ -131,19 +132,19 @@ class Tests_Force_Atlas2 : public ::testing::TestWithParam } // Allocate COO on device - rmm::device_vector srcs_v(nnz); - rmm::device_vector dests_v(nnz); - rmm::device_vector weights_v(nnz); + rmm::device_uvector srcs_v(nnz, stream); + rmm::device_uvector dests_v(nnz, stream); + rmm::device_uvector weights_v(nnz, stream); - int* srcs = srcs_v.data().get(); - int* dests = dests_v.data().get(); - T* weights = weights_v.data().get(); + int* srcs = srcs_v.data(); + int* dests = dests_v.data(); + T* weights = weights_v.data(); // FIXME: RAFT error handling mechanism should be used instead CUDA_TRY(cudaMemcpy(srcs, &cooRowInd[0], sizeof(int) * nnz, cudaMemcpyDefault)); CUDA_TRY(cudaMemcpy(dests, &cooColInd[0], sizeof(int) * nnz, cudaMemcpyDefault)); CUDA_TRY(cudaMemcpy(weights, &cooVal[0], sizeof(T) * nnz, cudaMemcpyDefault)); - cugraph::GraphCOOView G(srcs, dests, weights, m, nnz); + cugraph::legacy::GraphCOOView G(srcs, dests, weights, m, nnz); const int max_iter = 500; float* x_start = nullptr; @@ -163,8 +164,9 @@ class Tests_Force_Atlas2 : public ::testing::TestWithParam if (PERF) { hr_clock.start(); for (int i = 0; i < PERF_MULTIPLIER; ++i) { - cugraph::force_atlas2(G, - d_force_atlas2, + cugraph::force_atlas2(handle, + G, + pos.data(), max_iter, x_start, y_start, @@ -185,8 +187,9 @@ class Tests_Force_Atlas2 : public ::testing::TestWithParam force_atlas2_time.push_back(time_tmp); } else { cudaProfilerStart(); - cugraph::force_atlas2(G, - d_force_atlas2, + cugraph::force_atlas2(handle, + G, + 
pos.data(), max_iter, x_start, y_start, @@ -207,12 +210,13 @@ class Tests_Force_Atlas2 : public ::testing::TestWithParam // Copy pos to host std::vector h_pos(m * 2); - CUDA_TRY(cudaMemcpy(&h_pos[0], d_force_atlas2, sizeof(float) * m * 2, cudaMemcpyDeviceToHost)); + CUDA_TRY(cudaMemcpy(&h_pos[0], pos.data(), sizeof(float) * m * 2, cudaMemcpyDeviceToHost)); // Transpose the data std::vector> C_contiguous_embedding(m, std::vector(2)); for (int i = 0; i < m; i++) { - for (int j = 0; j < 2; j++) C_contiguous_embedding[i][j] = h_pos[j * m + i]; + for (int j = 0; j < 2; j++) + C_contiguous_embedding[i][j] = h_pos[j * m + i]; } // Test trustworthiness @@ -229,12 +233,12 @@ TEST_P(Tests_Force_Atlas2, CheckFP32_T) { run_current_test(GetParam()); } TEST_P(Tests_Force_Atlas2, CheckFP64_T) { run_current_test(GetParam()); } // --gtest_filter=*simple_test* -INSTANTIATE_TEST_CASE_P(simple_test, - Tests_Force_Atlas2, - ::testing::Values(Force_Atlas2_Usecase("test/datasets/karate.mtx", 0.73), - Force_Atlas2_Usecase("test/datasets/dolphins.mtx", 0.69), - Force_Atlas2_Usecase("test/datasets/polbooks.mtx", 0.76), - Force_Atlas2_Usecase("test/datasets/netscience.mtx", - 0.80))); +INSTANTIATE_TEST_SUITE_P(simple_test, + Tests_Force_Atlas2, + ::testing::Values(Force_Atlas2_Usecase("test/datasets/karate.mtx", 0.73), + Force_Atlas2_Usecase("test/datasets/dolphins.mtx", 0.69), + Force_Atlas2_Usecase("test/datasets/polbooks.mtx", 0.76), + Force_Atlas2_Usecase("test/datasets/netscience.mtx", + 0.80))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/layout/knn.h b/cpp/tests/layout/knn.h index 07d07528769..26666794896 100644 --- a/cpp/tests/layout/knn.h +++ b/cpp/tests/layout/knn.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,7 +44,8 @@ double sq_euclid_dist(const point& x, const point& y) double total = 0; auto i = x.attributes.begin(); auto j = y.attributes.begin(); - for (; i != x.attributes.end() && j != y.attributes.end(); ++i, ++j) total += pow(*i - *j, 2); + for (; i != x.attributes.end() && j != y.attributes.end(); ++i, ++j) + total += pow(*i - *j, 2); return total; } @@ -63,6 +64,7 @@ std::vector knn_classify(std::list& dataframe, const point& c, const auto count = 0; auto j = distances.begin(); ++j; - for (; j != distances.end() && count < k; ++j, ++count) res.push_back(j->index); + for (; j != distances.end() && count < k; ++j, ++count) + res.push_back(j->index); return res; } diff --git a/cpp/tests/layout/trust_worthiness.h b/cpp/tests/layout/trust_worthiness.h index 40c9782a76e..5a112ea3c6b 100644 --- a/cpp/tests/layout/trust_worthiness.h +++ b/cpp/tests/layout/trust_worthiness.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
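One editorial observation on the distance helpers touched above: both knn.h's sq_euclid_dist and trust_worthiness.h's euclidian_dist accumulate the sum of squared differences with an explicit two-iterator loop. An equivalent standard-library formulation, offered only as a sketch (the PR itself keeps the explicit loops and merely reformats them), is std::inner_product with a squared-difference combiner:

```cpp
// Editorial sketch: squared Euclidean distance via <numeric>, equivalent to the
// hand-rolled loop in cpp/tests/layout/knn.h. Assumes y is at least as long as
// x; the original loop instead stops at the shorter of the two ranges.
#include <cassert>
#include <functional>
#include <numeric>
#include <vector>

double sq_euclid_dist(std::vector<double> const& x, std::vector<double> const& y)
{
  return std::inner_product(x.begin(), x.end(), y.begin(), 0.0, std::plus<>{},
                            [](double a, double b) { return (a - b) * (a - b); });
}

int main()
{
  assert(sq_euclid_dist({0.0, 3.0}, {4.0, 0.0}) == 25.0);  // 16 + 9
  return 0;
}
```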
@@ -25,7 +25,8 @@ double euclidian_dist(const std::vector& x, const std::vector& y) double total = 0; auto i = x.begin(); auto j = y.begin(); - for (; i != x.end() && j != y.end(); ++i, ++j) total += pow(*i, 2) - 2 * *i * *j + pow(*j, 2); + for (; i != x.end() && j != y.end(); ++i, ++j) + total += pow(*i, 2) - 2 * *i * *j + pow(*j, 2); return sqrt(total); } @@ -62,7 +63,8 @@ std::vector argsort(Iter begin, Iter end, Compare comp) return comp(*prev.second, *next.second); }); - for (auto i : pairList) ret.push_back(i.first); + for (auto i : pairList) + ret.push_back(i.first); return ret; } @@ -109,7 +111,8 @@ double compute_rank(const std::vector>& ind_X, ranks[j] = idx; } } - for (auto& val : ranks) val -= k; + for (auto& val : ranks) + val -= k; for (const auto& val : ranks) if (val > 0) rank += val; @@ -122,7 +125,9 @@ void print_matrix(const std::vector>& matrix) { for (size_t i = 0; i < matrix.size(); ++i) { std::cout << "[ "; - for (size_t j = 0; j < matrix[i].size(); ++j) { std::cout << matrix[i][j] << ' '; } + for (size_t j = 0; j < matrix[i].size(); ++j) { + std::cout << matrix[i][j] << ' '; + } std::cout << "]\n"; } } diff --git a/cpp/tests/linear_assignment/hungarian_test.cu b/cpp/tests/linear_assignment/hungarian_test.cu index 656957a85eb..f806a217a8f 100644 --- a/cpp/tests/linear_assignment/hungarian_test.cu +++ b/cpp/tests/linear_assignment/hungarian_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. * * NVIDIA CORPORATION and its licensors retain all intellectual property * and proprietary rights in and to this software, related documentation @@ -9,29 +9,31 @@ * */ -#include "cuda_profiler_api.h" -#include "gtest/gtest.h" - -#include -#include +#include -#include -#include +#include +#include #include -#include - #include -__global__ void setup_generator(curandState *state) +#include + +#include "cuda_profiler_api.h" +#include "gtest/gtest.h" + +#include +#include + +__global__ void setup_generator(curandState* state) { int id = threadIdx.x + blockIdx.x * blockDim.x; curand_init(43, id, 0, &state[id]); } template -__global__ void generate_random(curandState *state, int n, T *data, int32_t upper_bound) +__global__ void generate_random(curandState* state, int n, T* data, int32_t upper_bound) { int first = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; @@ -64,29 +66,35 @@ TEST_F(HungarianTest, Bipartite4x4) int32_t workers[] = {0, 1, 2, 3}; - float min_cost = 18.0; - int32_t expected[] = {6, 7, 5, 4}; + float min_cost = 18.0; + std::vector expected({6, 7, 5, 4}); + std::vector assignment({0, 0, 0, 0}); int32_t length = sizeof(src_data) / sizeof(src_data[0]); int32_t length_workers = sizeof(workers) / sizeof(workers[0]); int32_t num_vertices = 1 + std::max(*std::max_element(src_data, src_data + length), *std::max_element(dst_data, dst_data + length)); - rmm::device_vector src_v(src_data, src_data + length); - rmm::device_vector dst_v(dst_data, dst_data + length); - rmm::device_vector cost_v(cost, cost + length); - rmm::device_vector workers_v(workers, workers + length_workers); - rmm::device_vector expected_v(expected, expected + length_workers); - rmm::device_vector assignment_v(length_workers); + rmm::device_uvector src_v(length, handle.get_stream_view()); + rmm::device_uvector dst_v(length, handle.get_stream_view()); + rmm::device_uvector cost_v(length, handle.get_stream_view()); + rmm::device_uvector workers_v(length_workers, 
handle.get_stream_view()); + rmm::device_uvector assignment_v(length_workers, handle.get_stream_view()); + + raft::update_device(src_v.begin(), src_data, length, handle.get_stream()); + raft::update_device(dst_v.begin(), dst_data, length, handle.get_stream()); + raft::update_device(cost_v.begin(), cost, length, handle.get_stream()); + raft::update_device(workers_v.begin(), workers, length_workers, handle.get_stream()); + + cugraph::legacy::GraphCOOView g( + src_v.data(), dst_v.data(), cost_v.data(), num_vertices, length); - cugraph::GraphCOOView g( - src_v.data().get(), dst_v.data().get(), cost_v.data().get(), num_vertices, length); + float r = cugraph::hungarian(handle, g, length_workers, workers_v.data(), assignment_v.data()); - float r = cugraph::hungarian( - handle, g, length_workers, workers_v.data().get(), assignment_v.data().get()); + raft::update_host(assignment.data(), assignment_v.begin(), length_workers, handle.get_stream()); EXPECT_EQ(min_cost, r); - EXPECT_EQ(expected_v, assignment_v); + EXPECT_EQ(assignment, expected); } TEST_F(HungarianTest, Bipartite5x5) @@ -100,29 +108,36 @@ TEST_F(HungarianTest, Bipartite5x5) int32_t workers[] = {0, 1, 2, 3, 4}; - float min_cost = 51.0; - int32_t expected[] = {5, 7, 8, 6, 9}; + float min_cost = 51.0; + std::vector expected({5, 7, 8, 6, 9}); + std::vector assignment({0, 0, 0, 0, 0}); int32_t length = sizeof(src_data) / sizeof(src_data[0]); int32_t length_workers = sizeof(workers) / sizeof(workers[0]); int32_t num_vertices = 1 + std::max(*std::max_element(src_data, src_data + length), *std::max_element(dst_data, dst_data + length)); - rmm::device_vector src_v(src_data, src_data + length); - rmm::device_vector dst_v(dst_data, dst_data + length); - rmm::device_vector cost_v(cost, cost + length); - rmm::device_vector workers_v(workers, workers + length_workers); - rmm::device_vector expected_v(expected, expected + length_workers); - rmm::device_vector assignment_v(length_workers); + rmm::device_uvector src_v(length, handle.get_stream_view()); + rmm::device_uvector dst_v(length, handle.get_stream_view()); + rmm::device_uvector cost_v(length, handle.get_stream_view()); + rmm::device_uvector workers_v(length_workers, handle.get_stream_view()); + rmm::device_uvector assignment_v(length_workers, handle.get_stream_view()); - cugraph::GraphCOOView g( - src_v.data().get(), dst_v.data().get(), cost_v.data().get(), num_vertices, length); + raft::update_device(src_v.begin(), src_data, length, handle.get_stream()); + raft::update_device(dst_v.begin(), dst_data, length, handle.get_stream()); + raft::update_device(cost_v.begin(), cost, length, handle.get_stream()); + raft::update_device(workers_v.begin(), workers, length_workers, handle.get_stream()); - float r = cugraph::hungarian( - handle, g, length_workers, workers_v.data().get(), assignment_v.data().get()); + cugraph::legacy::GraphCOOView g( + src_v.data(), dst_v.data(), cost_v.data(), num_vertices, length); + + float r = cugraph::hungarian(handle, g, length_workers, workers_v.data(), assignment_v.data()); + + raft::update_host( + assignment.data(), assignment_v.begin(), assignment_v.size(), handle.get_stream()); EXPECT_EQ(min_cost, r); - EXPECT_EQ(expected_v, assignment_v); + EXPECT_EQ(assignment, expected); } TEST_F(HungarianTest, Bipartite4x4_multiple_answers) @@ -135,40 +150,44 @@ TEST_F(HungarianTest, Bipartite4x4_multiple_answers) int32_t workers[] = {0, 1, 2, 3}; - float min_cost = 13.0; - int32_t expected1[] = {7, 6, 5, 4}; - int32_t expected2[] = {6, 7, 5, 4}; - int32_t expected3[] = {7, 6, 4, 
5}; - int32_t expected4[] = {6, 7, 4, 5}; + float min_cost = 13.0; + + std::vector expected1({7, 6, 5, 4}); + std::vector expected2({6, 7, 5, 4}); + std::vector expected3({7, 6, 4, 5}); + std::vector expected4({6, 7, 4, 5}); + std::vector assignment({0, 0, 0, 0}); int32_t length = sizeof(src_data) / sizeof(src_data[0]); int32_t length_workers = sizeof(workers) / sizeof(workers[0]); int32_t num_vertices = 1 + std::max(*std::max_element(src_data, src_data + length), *std::max_element(dst_data, dst_data + length)); - rmm::device_vector src_v(src_data, src_data + length); - rmm::device_vector dst_v(dst_data, dst_data + length); - rmm::device_vector cost_v(cost, cost + length); - rmm::device_vector workers_v(workers, workers + length_workers); - rmm::device_vector assignment_v(length_workers); + rmm::device_uvector src_v(length, handle.get_stream_view()); + rmm::device_uvector dst_v(length, handle.get_stream_view()); + rmm::device_uvector cost_v(length, handle.get_stream_view()); + rmm::device_uvector workers_v(length_workers, handle.get_stream_view()); + rmm::device_uvector assignment_v(length_workers, handle.get_stream_view()); - rmm::device_vector expected1_v(expected1, expected1 + length_workers); - rmm::device_vector expected2_v(expected2, expected2 + length_workers); - rmm::device_vector expected3_v(expected3, expected3 + length_workers); - rmm::device_vector expected4_v(expected4, expected4 + length_workers); + raft::update_device(src_v.begin(), src_data, length, handle.get_stream()); + raft::update_device(dst_v.begin(), dst_data, length, handle.get_stream()); + raft::update_device(cost_v.begin(), cost, length, handle.get_stream()); + raft::update_device(workers_v.begin(), workers, length_workers, handle.get_stream()); - cugraph::GraphCOOView g( - src_v.data().get(), dst_v.data().get(), cost_v.data().get(), num_vertices, length); + cugraph::legacy::GraphCOOView g( + src_v.data(), dst_v.data(), cost_v.data(), num_vertices, length); - float r = cugraph::hungarian( - handle, g, length_workers, workers_v.data().get(), assignment_v.data().get()); + float r = cugraph::hungarian(handle, g, length_workers, workers_v.data(), assignment_v.data()); EXPECT_EQ(min_cost, r); - EXPECT_TRUE(thrust::equal(assignment_v.begin(), assignment_v.end(), expected1_v.begin()) || - thrust::equal(assignment_v.begin(), assignment_v.end(), expected2_v.begin()) || - thrust::equal(assignment_v.begin(), assignment_v.end(), expected3_v.begin()) || - thrust::equal(assignment_v.begin(), assignment_v.end(), expected4_v.begin())); + raft::update_host( + assignment.data(), assignment_v.data(), assignment_v.size(), handle.get_stream()); + + EXPECT_TRUE(std::equal(assignment.begin(), assignment.end(), expected1.begin()) || + std::equal(assignment.begin(), assignment.end(), expected2.begin()) || + std::equal(assignment.begin(), assignment.end(), expected3.begin()) || + std::equal(assignment.begin(), assignment.end(), expected4.begin())); } TEST_F(HungarianTest, May29InfLoop) @@ -181,13 +200,82 @@ TEST_F(HungarianTest, May29InfLoop) float min_cost = 2; - rmm::device_vector cost_v(cost, cost + num_rows * num_cols); - rmm::device_vector assignment_v(num_rows); + std::vector expected({3, 2, 1, 0}); + std::vector assignment({0, 0, 0, 0}); + + rmm::device_uvector cost_v(num_rows * num_cols, handle.get_stream_view()); + rmm::device_uvector assignment_v(num_rows, handle.get_stream_view()); + + raft::update_device(cost_v.begin(), cost, num_rows * num_cols, handle.get_stream()); + + float r = + cugraph::dense::hungarian(handle, 
cost_v.data(), num_rows, num_cols, assignment_v.data()); + + raft::update_host( + assignment.data(), assignment_v.data(), assignment_v.size(), handle.get_stream()); + + EXPECT_EQ(min_cost, r); + EXPECT_EQ(assignment, expected); +} + +TEST_F(HungarianTest, Dense4x6) +{ + raft::handle_t handle{}; + + int32_t num_rows = 4; + int32_t num_cols = 6; + float cost[] = {0, 16, 1, 0, 90, 100, 33, 45, 0, 4, 90, 100, + 22, 0, 1000, 2000, 90, 100, 2, 0, 3000, 4000, 90, 100}; + + float min_cost = 2; + + std::vector expected({3, 2, 1, 0}); + std::vector assignment({0, 0, 0, 0}); + + rmm::device_uvector cost_v(num_rows * num_cols, handle.get_stream_view()); + rmm::device_uvector assignment_v(num_rows, handle.get_stream_view()); + + raft::update_device(cost_v.begin(), cost, num_rows * num_cols, handle.get_stream()); + + float r = + cugraph::dense::hungarian(handle, cost_v.data(), num_rows, num_cols, assignment_v.data()); - float r = cugraph::dense::hungarian( - handle, cost_v.data().get(), num_rows, num_cols, assignment_v.data().get()); + raft::update_host( + assignment.data(), assignment_v.data(), assignment_v.size(), handle.get_stream()); EXPECT_EQ(min_cost, r); + EXPECT_EQ(assignment, expected); +} + +TEST_F(HungarianTest, Dense6x4) +{ + raft::handle_t handle{}; + + int32_t num_rows = 6; + int32_t num_cols = 4; + float cost[] = {0, 16, 1, 0, 33, 45, 0, 4, 90, 100, 110, 120, + 22, 0, 1000, 2000, 90, 100, 110, 120, 2, 0, 3000, 4000}; + + float min_cost = 2; + + std::vector expected1({3, 2, 4, 1, 5, 0}); + std::vector expected2({3, 2, 5, 1, 4, 0}); + std::vector assignment({0, 0, 0, 0, 0, 0}); + + rmm::device_uvector cost_v(num_rows * num_cols, handle.get_stream_view()); + rmm::device_uvector assignment_v(num_rows, handle.get_stream_view()); + + raft::update_device(cost_v.begin(), cost, num_rows * num_cols, handle.get_stream()); + + float r = + cugraph::dense::hungarian(handle, cost_v.data(), num_rows, num_cols, assignment_v.data()); + + raft::update_host( + assignment.data(), assignment_v.data(), assignment_v.size(), handle.get_stream()); + + EXPECT_EQ(min_cost, r); + EXPECT_TRUE(std::equal(assignment.begin(), assignment.end(), expected1.begin()) || + std::equal(assignment.begin(), assignment.end(), expected2.begin())); } TEST_F(HungarianTest, PythonTestFailure) @@ -229,13 +317,22 @@ TEST_F(HungarianTest, PythonTestFailure) float min_cost = 16; - rmm::device_vector cost_v(cost, cost + num_rows * num_cols); - rmm::device_vector assignment_v(num_rows); + std::vector expected({0, 2, 1, 4, 3}); + std::vector assignment({0, 0, 0, 0, 0}); + + rmm::device_uvector cost_v(num_rows * num_cols, handle.get_stream_view()); + rmm::device_uvector assignment_v(num_rows, handle.get_stream_view()); + + raft::update_device(cost_v.begin(), cost, num_rows * num_cols, handle.get_stream()); + + float r = + cugraph::dense::hungarian(handle, cost_v.data(), num_rows, num_cols, assignment_v.data()); - float r = cugraph::dense::hungarian( - handle, cost_v.data().get(), num_rows, num_cols, assignment_v.data().get()); + raft::update_host( + assignment.data(), assignment_v.data(), assignment_v.size(), handle.get_stream()); EXPECT_EQ(min_cost, r); + EXPECT_EQ(assignment, expected); } // FIXME: Need to have tests with nxm (e.g. 
4x5 and 5x4) to test those conditions
@@ -249,19 +346,19 @@ void random_test(int32_t num_rows, int32_t num_cols, int32_t upper_bound, int re
   HighResTimer hr_timer;
-  rmm::device_vector data_v(num_rows * num_cols);
-  rmm::device_vector state_vals_v(num_threads);
-  rmm::device_vector assignment_v(num_rows);
+  rmm::device_uvector data_v(num_rows * num_cols, handle.get_stream_view());
+  rmm::device_uvector state_vals_v(num_threads, handle.get_stream_view());
+  rmm::device_uvector assignment_v(num_rows, handle.get_stream_view());
   std::vector validate(num_cols);
   hr_timer.start("initialization");
   cudaStream_t stream{0};
-  int32_t *d_data = data_v.data().get();
+  int32_t *d_data = data_v.data();
   //int64_t seed{85};
   int64_t seed{time(nullptr)};
-
+
   thrust::for_each(rmm::exec_policy(stream)->on(stream),
                    thrust::make_counting_iterator(0),
                    thrust::make_counting_iterator(num_rows * num_cols),
@@ -280,15 +377,15 @@ void random_test(int32_t num_rows, int32_t num_cols, int32_t upper_bound, int re
   for (int i = 0 ; i < repetitions ; ++i) {
     hr_timer.start("hungarian");
-    r = cugraph::hungarian_dense(cost_v.data().get(), num_rows, num_cols, assignment_v.data().get());
+    r = cugraph::hungarian_dense(cost_v.data(), num_rows, num_cols, assignment_v.data());
     hr_timer.stop();
   }
   std::cout << "cost = " << r << std::endl;
   hr_timer.display(std::cout);
-
-  for (int i = 0 ; i < num_cols ; ++i)
+
+  for (int i = 0 ; i < num_cols ; ++i)
     validate[i] = 0;
   int32_t assignment_out_of_range{0};
@@ -303,8 +400,8 @@ void random_test(int32_t num_rows, int32_t num_cols, int32_t upper_bound, int re
   EXPECT_EQ(assignment_out_of_range, 0);
-  int32_t assignment_missed = 0;
-
+  int32_t assignment_missed = 0;
+
   for (int32_t i = 0 ; i < num_cols ; ++i) {
     if (validate[i] != 1) {
       ++assignment_missed;
diff --git a/cpp/tests/pagerank/mg_pagerank_test.cpp b/cpp/tests/pagerank/mg_pagerank_test.cpp
new file mode 100644
index 00000000000..fca889c3299
--- /dev/null
+++ b/cpp/tests/pagerank/mg_pagerank_test.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+// do the perf measurements
+// enabled by command line parameter '--perf'
+//
+static int PERF = 0;
+
+struct PageRank_Usecase {
+  double personalization_ratio{0.0};
+  bool test_weighted{false};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGPageRank
+  : public ::testing::TestWithParam<std::tuple<PageRank_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGPageRank() {}
+  static void SetupTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Compare the results of running PageRank on multiple GPUs to that of a single-GPU run
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_current_test(PageRank_Usecase const& pagerank_usecase,
+                        input_usecase_t const& input_usecase)
+  {
+    // 1.
initialize handle + raft::handle_t handle{}; + HighResClock hr_clock{}; + + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + auto row_comm_size = static_cast(sqrt(static_cast(comm_size))); + while (comm_size % row_comm_size != 0) { + --row_comm_size; + } + cugraph::partition_2d::subcomm_factory_t + subcomm_factory(handle, row_comm_size); + + // 2. create MG graph + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle.get_comms().barrier(); + hr_clock.start(); + } + + auto [mg_graph, d_mg_renumber_map_labels] = + input_usecase.template construct_graph( + handle, pagerank_usecase.test_weighted, true); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle.get_comms().barrier(); + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "MG construct_graph took " << elapsed_time * 1e-6 << " s.\n"; + } + + auto mg_graph_view = mg_graph.view(); + + // 3. generate personalization vertex/value pairs + + std::optional> h_mg_personalization_vertices{std::nullopt}; + std::optional> h_mg_personalization_values{std::nullopt}; + if (pagerank_usecase.personalization_ratio > 0.0) { + std::default_random_engine generator{ + static_cast(comm.get_rank()) /* seed */}; + std::uniform_real_distribution distribution{0.0, 1.0}; + h_mg_personalization_vertices = + std::vector(mg_graph_view.get_number_of_local_vertices()); + std::iota((*h_mg_personalization_vertices).begin(), + (*h_mg_personalization_vertices).end(), + mg_graph_view.get_local_vertex_first()); + (*h_mg_personalization_vertices) + .erase(std::remove_if((*h_mg_personalization_vertices).begin(), + (*h_mg_personalization_vertices).end(), + [&generator, &distribution, pagerank_usecase](auto v) { + return distribution(generator) >= + pagerank_usecase.personalization_ratio; + }), + (*h_mg_personalization_vertices).end()); + h_mg_personalization_values = std::vector((*h_mg_personalization_vertices).size()); + std::for_each((*h_mg_personalization_values).begin(), + (*h_mg_personalization_values).end(), + [&distribution, &generator](auto& val) { val = distribution(generator); }); + } + + auto d_mg_personalization_vertices = + h_mg_personalization_vertices + ? std::make_optional>((*h_mg_personalization_vertices).size(), + handle.get_stream()) + : std::nullopt; + auto d_mg_personalization_values = + h_mg_personalization_values ? std::make_optional>( + (*d_mg_personalization_vertices).size(), handle.get_stream()) + : std::nullopt; + if (d_mg_personalization_vertices) { + raft::update_device((*d_mg_personalization_vertices).data(), + (*h_mg_personalization_vertices).data(), + (*h_mg_personalization_vertices).size(), + handle.get_stream()); + raft::update_device((*d_mg_personalization_values).data(), + (*h_mg_personalization_values).data(), + (*h_mg_personalization_values).size(), + handle.get_stream()); + } + + // 4. run MG PageRank + + result_t constexpr alpha{0.85}; + result_t constexpr epsilon{1e-6}; + + rmm::device_uvector d_mg_pageranks(mg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle.get_comms().barrier(); + hr_clock.start(); + } + + cugraph::experimental::pagerank( + handle, + mg_graph_view, + std::nullopt, + d_mg_personalization_vertices + ? 
std::optional{(*d_mg_personalization_vertices).data()} + : std::nullopt, + d_mg_personalization_values + ? std::optional{(*d_mg_personalization_values).data()} + : std::nullopt, + d_mg_personalization_vertices + ? std::optional{static_cast((*d_mg_personalization_vertices).size())} + : std::nullopt, + d_mg_pageranks.data(), + alpha, + epsilon, + std::numeric_limits::max(), + false); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle.get_comms().barrier(); + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "MG PageRank took " << elapsed_time * 1e-6 << " s.\n"; + } + + // 5. compare SG & MG results + + if (pagerank_usecase.check_correctness) { + // 5-1. aggregate MG results + + auto d_mg_aggregate_renumber_map_labels = cugraph::test::device_gatherv( + handle, (*d_mg_renumber_map_labels).data(), (*d_mg_renumber_map_labels).size()); + auto d_mg_aggregate_personalization_vertices = + d_mg_personalization_vertices + ? std::optional>{cugraph::test::device_gatherv( + handle, + (*d_mg_personalization_vertices).data(), + (*d_mg_personalization_vertices).size())} + : std::nullopt; + auto d_mg_aggregate_personalization_values = + d_mg_personalization_values + ? std::optional>{cugraph::test::device_gatherv( + handle, (*d_mg_personalization_values).data(), (*d_mg_personalization_values).size())} + : std::nullopt; + auto d_mg_aggregate_pageranks = + cugraph::test::device_gatherv(handle, d_mg_pageranks.data(), d_mg_pageranks.size()); + + if (handle.get_comms().get_rank() == int{0}) { + // 5-2. unrenumber MG results + + if (d_mg_aggregate_personalization_vertices) { + cugraph::experimental::unrenumber_int_vertices( + handle, + (*d_mg_aggregate_personalization_vertices).data(), + (*d_mg_aggregate_personalization_vertices).size(), + d_mg_aggregate_renumber_map_labels.data(), + vertex_t{0}, + mg_graph_view.get_number_of_vertices(), + std::vector{mg_graph_view.get_number_of_vertices()}); + std::tie(d_mg_aggregate_personalization_vertices, d_mg_aggregate_personalization_values) = + cugraph::test::sort_by_key(handle, + (*d_mg_aggregate_personalization_vertices).data(), + (*d_mg_aggregate_personalization_values).data(), + (*d_mg_aggregate_personalization_vertices).size()); + } + std::tie(std::ignore, d_mg_aggregate_pageranks) = + cugraph::test::sort_by_key(handle, + d_mg_aggregate_renumber_map_labels.data(), + d_mg_aggregate_pageranks.data(), + d_mg_aggregate_renumber_map_labels.size()); + + // 5-3. create SG graph + + cugraph::experimental::graph_t sg_graph(handle); + std::tie(sg_graph, std::ignore) = + input_usecase.template construct_graph( + handle, pagerank_usecase.test_weighted, false); + + auto sg_graph_view = sg_graph.view(); + + ASSERT_EQ(mg_graph_view.get_number_of_vertices(), sg_graph_view.get_number_of_vertices()); + + // 5-4. run SG PageRank + + rmm::device_uvector d_sg_pageranks(sg_graph_view.get_number_of_vertices(), + handle.get_stream()); + + cugraph::experimental::pagerank( + handle, + sg_graph_view, + std::nullopt, + d_mg_aggregate_personalization_vertices + ? std::optional{(*d_mg_aggregate_personalization_vertices).data()} + : std::nullopt, + d_mg_aggregate_personalization_values + ? std::optional{(*d_mg_aggregate_personalization_values).data()} + : std::nullopt, + d_mg_aggregate_personalization_vertices + ? std::optional{static_cast( + (*d_mg_aggregate_personalization_vertices).size())} + : std::nullopt, + d_sg_pageranks.data(), + alpha, + epsilon, + std::numeric_limits::max(), // max_iterations + false); + + // 5-5. compare
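// [Illustrative aside] The comparison below treats an SG and an MG PageRank value as
// equal when they differ by less than max(max(lhs, rhs) * threshold_ratio,
// threshold_magnitude); the magnitude floor keeps noise in very lowly ranked vertices
// from failing the test. A self-contained host sketch of that tolerance (the vertex
// count and sample values are hypothetical):

#include <algorithm>
#include <cassert>
#include <cmath>

static bool nearly_equal_sketch(double lhs, double rhs)
{
  double const threshold_ratio     = 1e-3;
  double const threshold_magnitude = (1.0 / 1000.0) * threshold_ratio;  // 1000 vertices -> 1e-6 floor
  return std::abs(lhs - rhs) <
         std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
}

int main()
{
  assert(nearly_equal_sketch(0.1, 0.10005));   // within 0.1% -> equal
  assert(!nearly_equal_sketch(0.1, 0.101));    // ~1% apart -> not equal
  assert(nearly_equal_sketch(1e-9, 9e-7));     // both below the 1e-6 floor -> equal
  return 0;
}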
+ + std::vector h_mg_aggregate_pageranks(mg_graph_view.get_number_of_vertices()); + raft::update_host(h_mg_aggregate_pageranks.data(), + d_mg_aggregate_pageranks.data(), + d_mg_aggregate_pageranks.size(), + handle.get_stream()); + + std::vector h_sg_pageranks(sg_graph_view.get_number_of_vertices()); + raft::update_host( + h_sg_pageranks.data(), d_sg_pageranks.data(), d_sg_pageranks.size(), handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for low PageRank vertices (lowly ranked vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; + + ASSERT_TRUE(std::equal(h_mg_aggregate_pageranks.begin(), + h_mg_aggregate_pageranks.end(), + h_sg_pageranks.begin(), + nearly_equal)); + } + } + } +}; + +using Tests_MGPageRank_File = Tests_MGPageRank; +using Tests_MGPageRank_Rmat = Tests_MGPageRank; + +TEST_P(Tests_MGPageRank_File, CheckInt32Int32FloatFloat) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MGPageRank_Rmat, CheckInt32Int32FloatFloat) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +INSTANTIATE_TEST_SUITE_P( + file_tests, + Tests_MGPageRank_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(PageRank_Usecase{0.0, false}, + PageRank_Usecase{0.5, false}, + PageRank_Usecase{0.0, true}, + PageRank_Usecase{0.5, true}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_tests, + Tests_MGPageRank_Rmat, + ::testing::Combine(::testing::Values(PageRank_Usecase{0.0, false}, + PageRank_Usecase{0.5, false}, + PageRank_Usecase{0.0, true}, + PageRank_Usecase{0.5, true}), + ::testing::Values(cugraph::test::Rmat_Usecase( + 10, 16, 0.57, 0.19, 0.19, 0, false, false, 0, true)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_large_tests, + Tests_MGPageRank_Rmat, + ::testing::Combine(::testing::Values(PageRank_Usecase{0.0, false, false}, + PageRank_Usecase{0.5, false, false}, + PageRank_Usecase{0.0, true, false}, + PageRank_Usecase{0.5, true, false}), + ::testing::Values(cugraph::test::Rmat_Usecase( + 20, 32, 0.57, 0.19, 0.19, 0, false, false, 0, true)))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/pagerank/pagerank_test.cpp b/cpp/tests/pagerank/pagerank_test.cpp deleted file mode 100644 index 48705f7f324..00000000000 --- a/cpp/tests/pagerank/pagerank_test.cpp +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited.
- * - */ - -// Pagerank solver tests -// Author: Alex Fender afender@nvidia.com - -#include -#include -#include - -#include -#include - -#include -#include - -#include - -#include - -#include - -// do the perf measurements -// enabled by command line parameter s'--perf' -static int PERF = 0; - -// iterations for perf tests -// enabled by command line parameter '--perf-iters" -static int PERF_MULTIPLIER = 5; - -typedef struct Pagerank_Usecase_t { - std::string matrix_file; - std::string result_file; - Pagerank_Usecase_t(const std::string& a, const std::string& b) - { - // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR - const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); - if ((a != "") && (a[0] != '/')) { - matrix_file = rapidsDatasetRootDir + "/" + a; - } else { - matrix_file = a; - } - if ((b != "") && (b[0] != '/')) { - result_file = rapidsDatasetRootDir + "/" + b; - } else { - result_file = b; - } - } - Pagerank_Usecase_t& operator=(const Pagerank_Usecase_t& rhs) - { - matrix_file = rhs.matrix_file; - result_file = rhs.result_file; - return *this; - } -} Pagerank_Usecase; - -class Tests_Pagerank : public ::testing::TestWithParam { - public: - Tests_Pagerank() {} - static void SetupTestCase() {} - static void TearDownTestCase() - { - if (PERF) { - for (unsigned int i = 0; i < pagerank_time.size(); ++i) { - std::cout << pagerank_time[i] / PERF_MULTIPLIER << std::endl; - } - } - } - virtual void SetUp() {} - virtual void TearDown() {} - - static std::vector pagerank_time; - - template - void run_current_test(const Pagerank_Usecase& param) - { - const ::testing::TestInfo* const test_info = - ::testing::UnitTest::GetInstance()->current_test_info(); - std::stringstream ss; - std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + - std::string(test_info->name()) + std::string("_") + - cugraph::test::getFileName(param.matrix_file) + std::string("_") + - ss.str().c_str(); - - int m, k, nnz; - MM_typecode mc; - - float tol = 1E-5f; - - // Default parameters - /* - float alpha = 0.85; - int max_iter = 500; - bool has_guess = false; - */ - - HighResClock hr_clock; - double time_tmp; - - FILE* fpin = fopen(param.matrix_file.c_str(), "r"); - ASSERT_NE(fpin, nullptr) << "fopen (" << param.matrix_file << ") failure."; - - ASSERT_EQ(cugraph::test::mm_properties(fpin, 1, &mc, &m, &k, &nnz), 0) - << "could not read Matrix Market file properties" - << "\n"; - ASSERT_TRUE(mm_is_matrix(mc)); - ASSERT_TRUE(mm_is_coordinate(mc)); - ASSERT_FALSE(mm_is_complex(mc)); - ASSERT_FALSE(mm_is_skew(mc)); - - // Allocate memory on host - std::vector cooRowInd(nnz), cooColInd(nnz); - std::vector cooVal(nnz), pagerank(m); - - // device alloc - rmm::device_uvector pagerank_vector(static_cast(m), nullptr); - T* d_pagerank = pagerank_vector.data(); - - // Read - ASSERT_EQ((cugraph::test::mm_to_coo( - fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], &cooVal[0], NULL)), - 0) - << "could not read matrix data" - << "\n"; - ASSERT_EQ(fclose(fpin), 0); - - // Pagerank runs on CSC, so feed COOtoCSR the row/col backwards. 
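// [Illustrative aside] The comment above notes that this legacy PageRank consumed CSC,
// so the COO edge list was fed to the COO->CSR converter with row/col swapped: the CSR
// of the transposed graph is the CSC of the original. A minimal host sketch of that
// trick with a hypothetical 3-vertex graph (edges 0->1, 0->2, 1->2):

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> src{0, 0, 1}, dst{1, 2, 2};
  int n = 3;

  // Build CSR over (dst, src) instead of (src, dst); the result indexes incoming edges.
  std::vector<int> offsets(n + 1, 0), indices(src.size());
  for (int d : dst) ++offsets[d + 1];                        // count in-degree per vertex
  for (int v = 0; v < n; ++v) offsets[v + 1] += offsets[v];  // exclusive prefix sum
  std::vector<int> cursor(offsets.begin(), offsets.end() - 1);
  for (std::size_t e = 0; e < src.size(); ++e) indices[cursor[dst[e]]++] = src[e];

  // offsets = {0, 0, 1, 3}; indices = {0, 0, 1}: in-neighbors of 1 are {0}, of 2 are {0, 1}
  for (int v = 0; v < n; ++v)
    for (int e = offsets[v]; e < offsets[v + 1]; ++e)
      std::printf("in-edge of %d from %d\n", v, indices[e]);
  return 0;
}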
- raft::handle_t handle; - cugraph::GraphCOOView G_coo(&cooColInd[0], &cooRowInd[0], &cooVal[0], m, nnz); - auto G_unique = cugraph::coo_to_csr(G_coo); - cugraph::GraphCSCView G(G_unique->view().offsets, - G_unique->view().indices, - G_unique->view().edge_data, - G_unique->view().number_of_vertices, - G_unique->view().number_of_edges); - - cudaDeviceSynchronize(); - if (PERF) { - hr_clock.start(); - for (int i = 0; i < PERF_MULTIPLIER; ++i) { - cugraph::pagerank(handle, G, d_pagerank); - cudaDeviceSynchronize(); - } - hr_clock.stop(&time_tmp); - pagerank_time.push_back(time_tmp); - } else { - cudaProfilerStart(); - cugraph::pagerank(handle, G, d_pagerank); - cudaProfilerStop(); - cudaDeviceSynchronize(); - } - - // Check vs golden data - if (param.result_file.length() > 0) { - std::vector calculated_res(m); - - CUDA_TRY(cudaMemcpy(&calculated_res[0], d_pagerank, sizeof(T) * m, cudaMemcpyDeviceToHost)); - std::sort(calculated_res.begin(), calculated_res.end()); - fpin = fopen(param.result_file.c_str(), "rb"); - ASSERT_TRUE(fpin != NULL) << " Cannot read file with reference data: " << param.result_file - << std::endl; - std::vector expected_res(m); - ASSERT_EQ(cugraph::test::read_binary_vector(fpin, m, expected_res), 0); - fclose(fpin); - T err; - int n_err = 0; - for (int i = 0; i < m; i++) { - err = fabs(expected_res[i] - calculated_res[i]); - if (err > tol * 1.1) { - n_err++; // count the number of mismatches - } - } - if (n_err) { - EXPECT_LE(n_err, 0.001 * m); // we tolerate 0.1% of values with a litte difference - } - } - } -}; - -std::vector Tests_Pagerank::pagerank_time; - -TEST_P(Tests_Pagerank, CheckFP32_T) { run_current_test(GetParam()); } - -TEST_P(Tests_Pagerank, CheckFP64_T) { run_current_test(GetParam()); } - -// --gtest_filter=*simple_test* -INSTANTIATE_TEST_CASE_P( - simple_test, - Tests_Pagerank, - ::testing::Values(Pagerank_Usecase("test/datasets/karate.mtx", ""), - Pagerank_Usecase("test/datasets/web-Google.mtx", - "test/ref/pagerank/web-Google.pagerank_val_0.85.bin"), - Pagerank_Usecase("test/datasets/ljournal-2008.mtx", - "test/ref/pagerank/ljournal-2008.pagerank_val_0.85.bin"), - Pagerank_Usecase("test/datasets/webbase-1M.mtx", - "test/ref/pagerank/webbase-1M.pagerank_val_0.85.bin"))); - -CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/prims/mg_count_if_v.cu b/cpp/tests/prims/mg_count_if_v.cu new file mode 100644 index 00000000000..c956067cae8 --- /dev/null +++ b/cpp/tests/prims/mg_count_if_v.cu @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +// do the perf measurements +// enabled by command line parameter s'--perf' +// +static int PERF = 0; + +template +struct test_predicate { + int mod{}; + test_predicate(int mod_count) : mod(mod_count) {} + __device__ bool operator()(const vertex_t& val) + { + cuco::detail::MurmurHash3_32 hash_func{}; + return (0 == (hash_func(val) % mod)); + } +}; + +struct Prims_Usecase { + bool check_correctness{true}; +}; + +template +class Tests_MG_CountIfV + : public ::testing::TestWithParam> { + public: + Tests_MG_CountIfV() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of count_if_v primitive and thrust count_if on a single GPU + template + void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase) + { + // 1. initialize handle + + raft::handle_t handle{}; + HighResClock hr_clock{}; + + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + auto row_comm_size = static_cast(sqrt(static_cast(comm_size))); + while (comm_size % row_comm_size != 0) { + --row_comm_size; + } + cugraph::partition_2d::subcomm_factory_t + subcomm_factory(handle, row_comm_size); + + // 2. create MG graph + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle.get_comms().barrier(); + hr_clock.start(); + } + auto [mg_graph, d_mg_renumber_map_labels] = + input_usecase.template construct_graph( + handle, true, true); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle.get_comms().barrier(); + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "MG construct_graph took " << elapsed_time * 1e-6 << " s.\n"; + } + + auto mg_graph_view = mg_graph.view(); + + const int hash_bin_count = 5; + + // 3. run MG count if + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle.get_comms().barrier(); + hr_clock.start(); + } + + vertex_t const* data = (*d_mg_renumber_map_labels).data(); + auto vertex_count = + count_if_v(handle, mg_graph_view, data, test_predicate(hash_bin_count)); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle.get_comms().barrier(); + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "MG count if took " << elapsed_time * 1e-6 << " s.\n"; + } + + // 4. 
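// [Illustrative aside] The test_predicate above buckets each vertex id by
// hash(v) % hash_bin_count and the test counts the ids landing in bucket 0, so the MG
// count can be checked against a single-GPU thrust::count_if over the same range. A
// host-only analogue of that shape (std::hash stands in for cuco's device-side
// MurmurHash3_32 here, so the actual counts would differ from the real test's):

#include <cstdint>
#include <cstdio>
#include <functional>

int main()
{
  int const hash_bin_count    = 5;
  int32_t const num_vertices  = 100;  // hypothetical vertex range
  std::hash<int32_t> hash_func{};

  int64_t count = 0;
  for (int32_t v = 0; v < num_vertices; ++v) {
    if (hash_func(v) % hash_bin_count == 0) { ++count; }  // same shape as test_predicate
  }
  std::printf("%lld of %d ids fall in bin 0\n", static_cast<long long>(count), num_vertices);
  return 0;
}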
compare SG & MG results + + if (prims_usecase.check_correctness) { + cugraph::experimental::graph_t sg_graph( + handle); + std::tie(sg_graph, std::ignore) = + input_usecase.template construct_graph( + handle, true, false); + auto sg_graph_view = sg_graph.view(); + auto expected_vertex_count = + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(sg_graph_view.get_local_vertex_first()), + thrust::make_counting_iterator(sg_graph_view.get_local_vertex_last()), + test_predicate(hash_bin_count)); + ASSERT_TRUE(expected_vertex_count == vertex_count); + } + } +}; + +using Tests_MG_CountIfV_File = Tests_MG_CountIfV; +using Tests_MG_CountIfV_Rmat = Tests_MG_CountIfV; + +TEST_P(Tests_MG_CountIfV_File, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MG_CountIfV_Rmat, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MG_CountIfV_File, CheckInt32Int32FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MG_CountIfV_Rmat, CheckInt32Int32FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_MG_CountIfV_File, + ::testing::Combine( + ::testing::Values(Prims_Usecase{true}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_MG_CountIfV_Rmat, + ::testing::Combine(::testing::Values(Prims_Usecase{true}), + ::testing::Values(cugraph::test::Rmat_Usecase( + 10, 16, 0.57, 0.19, 0.19, 0, false, false, 0, true)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_large_test, + Tests_MG_CountIfV_Rmat, + ::testing::Combine(::testing::Values(Prims_Usecase{false}), + ::testing::Values(cugraph::test::Rmat_Usecase( + 20, 32, 0.57, 0.19, 0.19, 0, false, false, 0, true)))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/prims/mg_reduce_v.cu b/cpp/tests/prims/mg_reduce_v.cu new file mode 100644 index 00000000000..539e4e35ded --- /dev/null +++ b/cpp/tests/prims/mg_reduce_v.cu @@ -0,0 +1,365 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +// do the perf measurements +// enabled by command line parameter s'--perf' +// +static int PERF = 0; + +template +struct property_transform : public thrust::unary_function> { + int mod{}; + property_transform(int mod_count) : mod(mod_count) {} + __device__ auto operator()(const vertex_t& val) + { + cuco::detail::MurmurHash3_32 hash_func{}; + auto value = hash_func(val) % mod; + return thrust::make_tuple(static_cast(value)...); + } +}; +template typename Tuple, typename... T> +struct property_transform> : public property_transform { +}; + +template +auto make_iterator_tuple(Tuple& data, std::index_sequence) +{ + return thrust::make_tuple((std::get(data).begin())...); +} + +template +auto get_zip_iterator(std::tuple& data) +{ + return thrust::make_zip_iterator(make_iterator_tuple( + data, std::make_index_sequence>::value>())); +} + +template +auto get_property_iterator(std::tuple& data) +{ + return (std::get<0>(data)).begin(); +} + +template +auto get_property_iterator(std::tuple& data) +{ + return get_zip_iterator(data); +} + +template +struct generate_impl { + static thrust::tuple initial_value(int init) + { + return thrust::make_tuple(static_cast(init)...); + } + template + static std::tuple...> property(rmm::device_uvector& labels, + int hash_bin_count, + raft::handle_t const& handle) + { + auto data = std::make_tuple(rmm::device_uvector(labels.size(), handle.get_stream())...); + auto zip = get_zip_iterator(data); + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + labels.begin(), + labels.end(), + zip, + property_transform(hash_bin_count)); + return data; + } + template + static std::tuple...> property(thrust::counting_iterator begin, + thrust::counting_iterator end, + int hash_bin_count, + raft::handle_t const& handle) + { + auto length = thrust::distance(begin, end); + auto data = std::make_tuple(rmm::device_uvector(length, handle.get_stream())...); + auto zip = get_zip_iterator(data); + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + begin, + end, + zip, + property_transform(hash_bin_count)); + return data; + } +}; + +template +struct result_compare { + constexpr auto operator()(const T& t1, const T& t2) { return (t1 == t2); } +}; + +template +struct result_compare> { + static constexpr double threshold_ratio{1e-3}; + + private: + template + bool equal(T t1, T t2) + { + if constexpr (std::is_floating_point_v) { + return std::abs(t1 - t2) < (std::max(t1, t2) * threshold_ratio); + } + return t1 == t2; + } + template + constexpr auto equality_impl(T& t1, T& t2, std::index_sequence) + { + return (... 
&& (equal(thrust::get(t1), thrust::get(t2)))); + } + + public: + using Type = thrust::tuple; + constexpr auto operator()(const Type& t1, const Type& t2) + { + return equality_impl(t1, t2, std::make_index_sequence::value>()); + } +}; + +template +struct generate : public generate_impl { + static T initial_value(int init) { return static_cast(init); } +}; +template +struct generate> : public generate_impl { +}; + +struct Prims_Usecase { + bool check_correctness{true}; +}; + +template +class Tests_MG_ReduceIfV + : public ::testing::TestWithParam> { + public: + Tests_MG_ReduceIfV() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of the reduce_v primitive and thrust::reduce on a single GPU + template + void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase) + { + // 1. initialize handle + + raft::handle_t handle{}; + HighResClock hr_clock{}; + + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + auto row_comm_size = static_cast(sqrt(static_cast(comm_size))); + while (comm_size % row_comm_size != 0) { + --row_comm_size; + } + cugraph::partition_2d::subcomm_factory_t + subcomm_factory(handle, row_comm_size); + + // 2. create MG graph + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle.get_comms().barrier(); + hr_clock.start(); + } + auto [mg_graph, d_mg_renumber_map_labels] = + input_usecase.template construct_graph( + handle, true, true); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle.get_comms().barrier(); + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "MG construct_graph took " << elapsed_time * 1e-6 << " s.\n"; + } + + auto mg_graph_view = mg_graph.view(); + + // 3. run MG reduce_v + + const int hash_bin_count = 5; + const int initial_value = 10; + + auto property_initial_value = generate::initial_value(initial_value); + auto property_data = + generate::property((*d_mg_renumber_map_labels), hash_bin_count, handle); + auto property_iter = get_property_iterator(property_data); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle.get_comms().barrier(); + hr_clock.start(); + } + + auto result = reduce_v(handle, + mg_graph_view, + property_iter, + property_iter + (*d_mg_renumber_map_labels).size(), + property_initial_value); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle.get_comms().barrier(); + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "MG reduce_v took " << elapsed_time * 1e-6 << " s.\n"; + } + + // 4.
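// [Illustrative aside] reduce_v above folds tuple-valued vertex properties with an
// element-wise sum (cugraph::experimental::property_add) starting from a tuple-valued
// initial value. The same shape on the host, with std::pair standing in for
// thrust::tuple (the property values are hypothetical):

#include <cstdio>
#include <numeric>
#include <utility>
#include <vector>

int main()
{
  using property_t = std::pair<int, float>;
  std::vector<property_t> props{{1, 0.5f}, {2, 1.5f}, {3, 2.0f}};
  property_t init{10, 10.0f};  // mirrors initial_value(10) in the test above

  auto result = std::accumulate(
    props.begin(), props.end(), init, [](property_t a, property_t b) {
      return property_t{a.first + b.first, a.second + b.second};  // element-wise add
    });
  std::printf("reduced to (%d, %f)\n", result.first, result.second);  // (16, 14.0)
  return 0;
}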
compare SG & MG results + + if (prims_usecase.check_correctness) { + cugraph::experimental::graph_t sg_graph( + handle); + std::tie(sg_graph, std::ignore) = + input_usecase.template construct_graph( + handle, true, false); + auto sg_graph_view = sg_graph.view(); + + auto sg_property_data = generate::property( + thrust::make_counting_iterator(sg_graph_view.get_local_vertex_first()), + thrust::make_counting_iterator(sg_graph_view.get_local_vertex_last()), + hash_bin_count, + handle); + auto sg_property_iter = get_property_iterator(sg_property_data); + using property_t = decltype(property_initial_value); + + auto expected_result = + thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sg_property_iter, + sg_property_iter + sg_graph_view.get_number_of_local_vertices(), + property_initial_value, + cugraph::experimental::property_add()); + result_compare compare; + ASSERT_TRUE(compare(expected_result, result)); + } + } +}; + +using Tests_MG_ReduceIfV_File = Tests_MG_ReduceIfV; +using Tests_MG_ReduceIfV_Rmat = Tests_MG_ReduceIfV; + +TEST_P(Tests_MG_ReduceIfV_File, CheckInt32Int32FloatTupleIntFloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test, false>(std::get<0>(param), + std::get<1>(param)); +} + +TEST_P(Tests_MG_ReduceIfV_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test, false>(std::get<0>(param), + std::get<1>(param)); +} + +TEST_P(Tests_MG_ReduceIfV_File, CheckInt32Int32FloatTupleIntFloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test, true>(std::get<0>(param), + std::get<1>(param)); +} + +TEST_P(Tests_MG_ReduceIfV_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test, true>(std::get<0>(param), + std::get<1>(param)); +} + +TEST_P(Tests_MG_ReduceIfV_File, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MG_ReduceIfV_Rmat, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MG_ReduceIfV_File, CheckInt32Int32FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MG_ReduceIfV_Rmat, CheckInt32Int32FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_MG_ReduceIfV_File, + ::testing::Combine( + ::testing::Values(Prims_Usecase{true}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_MG_ReduceIfV_Rmat, + ::testing::Combine(::testing::Values(Prims_Usecase{true}), + ::testing::Values(cugraph::test::Rmat_Usecase( + 10, 16, 0.57, 0.19, 0.19, 0, false, false, 0, true)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_large_test, + Tests_MG_ReduceIfV_Rmat, + ::testing::Combine(::testing::Values(Prims_Usecase{false}), + ::testing::Values(cugraph::test::Rmat_Usecase( + 20, 32, 0.57, 0.19, 0.19, 0, false, false, 0, true)))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/renumber/renumber_test.cu b/cpp/tests/renumber/renumber_test.cu deleted file mode 100644 index a7102402acf..00000000000 --- 
a/cpp/tests/renumber/renumber_test.cu +++ /dev/null @@ -1,579 +0,0 @@ -// -*-c++-*- - -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -//#include "gmock/gmock.h" - -#include - -#include - -#include - -#include -#include - -#include - -struct RenumberingTest : public ::testing::Test { -}; - -__global__ void display_list(const char *label, uint32_t *verts, size_t length) -{ - printf("%s\n", label); - - for (size_t i = 0; i < length; ++i) { printf(" %u\n", verts[i]); } -} - -__global__ void setup_generator(curandState *state) -{ - int id = threadIdx.x + blockIdx.x * blockDim.x; - curand_init(43, id, 0, &state[id]); -} - -__global__ void generate_sources(curandState *state, int n, uint32_t *verts) -{ - int first = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - - curandState local_state = state[first]; - for (int id = first; id < n; id += stride) { verts[id] = curand(&local_state); } - - state[first] = local_state; -} - -__global__ void generate_destinations(curandState *state, - int n, - const uint32_t *sources, - uint32_t *destinations) -{ - int first = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - - curandState local_state = state[first]; - for (int id = first; id < n; id += stride) { - destinations[id] = sources[curand(&local_state) % n]; - } - - state[first] = local_state; -} - -TEST_F(RenumberingTest, SmallFixedVertexList) -{ - uint32_t src_data[] = {4U, 6U, 8U, 20U, 1U}; - uint32_t dst_data[] = {1U, 29U, 35U, 0U, 77U}; - - uint32_t src_expected[] = {2U, 3U, 4U, 5U, 1U}; - uint32_t dst_expected[] = {1U, 6U, 7U, 0U, 8U}; - - size_t length = sizeof(src_data) / sizeof(src_data[0]); - - uint32_t *src_d; - uint32_t *dst_d; - - uint32_t tmp_results[length]; - uint32_t tmp_map[2 * length]; - - rmm::device_vector src(length); - rmm::device_vector dst(length); - src_d = src.data().get(); - dst_d = dst.data().get(); - - EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint32_t) * length, cudaMemcpyHostToDevice), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(uint32_t) * length, cudaMemcpyHostToDevice), - cudaSuccess); - - size_t unique_verts = 0; - - auto number_map = cugraph::detail::renumber_vertices(length, - src_d, - dst_d, - src_d, - dst_d, - &unique_verts, - cugraph::detail::HashFunctionObjectInt(511), - thrust::less(), - rmm::mr::get_current_device_resource()); - - EXPECT_EQ(cudaMemcpy( - tmp_map, number_map->data(), sizeof(uint32_t) * unique_verts, cudaMemcpyDeviceToHost), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(tmp_results, src_d, sizeof(uint32_t) * length, cudaMemcpyDeviceToHost), - cudaSuccess); - - for (size_t i = 0; i < length; ++i) { - EXPECT_EQ(tmp_results[i], src_expected[i]); - EXPECT_EQ(tmp_map[tmp_results[i]], src_data[i]); - } - - EXPECT_EQ(cudaMemcpy(tmp_results, dst_d, sizeof(uint32_t) * length, cudaMemcpyDeviceToHost), - cudaSuccess); - for (size_t i = 0; i < length; ++i) { - EXPECT_EQ(tmp_results[i], dst_expected[i]); - 
EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]); - } -} - -TEST_F(RenumberingTest, SmallFixedVertexListNegative) -{ - int64_t src_data[] = {4, 6, 8, -20, 1}; - int64_t dst_data[] = {1, 29, 35, 0, 77}; - - int64_t src_expected[] = {2, 3, 4, 8, 1}; - int64_t dst_expected[] = {1, 5, 6, 0, 7}; - - size_t length = sizeof(src_data) / sizeof(src_data[0]); - - int64_t *src_d; - int64_t *dst_d; - - int64_t tmp_results[length]; - int64_t tmp_map[2 * length]; - - rmm::device_vector src(length); - rmm::device_vector dst(length); - src_d = src.data().get(); - dst_d = dst.data().get(); - - EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(int64_t) * length, cudaMemcpyHostToDevice), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(int64_t) * length, cudaMemcpyHostToDevice), - cudaSuccess); - - size_t unique_verts = 0; - - auto number_map = cugraph::detail::renumber_vertices(length, - src_d, - dst_d, - src_d, - dst_d, - &unique_verts, - cugraph::detail::HashFunctionObjectInt(511), - thrust::less(), - rmm::mr::get_current_device_resource()); - - EXPECT_EQ( - cudaMemcpy(tmp_map, number_map->data(), sizeof(int64_t) * unique_verts, cudaMemcpyDeviceToHost), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(tmp_results, src_d, sizeof(int64_t) * length, cudaMemcpyDeviceToHost), - cudaSuccess); - - for (size_t i = 0; i < length; ++i) { - EXPECT_EQ(tmp_results[i], src_expected[i]); - EXPECT_EQ(tmp_map[tmp_results[i]], src_data[i]); - } - - EXPECT_EQ(cudaMemcpy(tmp_results, dst_d, sizeof(int64_t) * length, cudaMemcpyDeviceToHost), - cudaSuccess); - for (size_t i = 0; i < length; ++i) { - EXPECT_EQ(tmp_results[i], dst_expected[i]); - EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]); - } -} - -TEST_F(RenumberingTest, SmallFixedVertexList64Bit) -{ - uint64_t src_data[] = {4U, 6U, 8U, 20U, 1U}; - uint64_t dst_data[] = {1U, 29U, 35U, 0U, 77U}; - - uint64_t src_expected[] = {2U, 3U, 4U, 5U, 1U}; - uint64_t dst_expected[] = {1U, 6U, 7U, 0U, 8U}; - - size_t length = sizeof(src_data) / sizeof(src_data[0]); - - uint64_t *src_d; - uint64_t *dst_d; - - uint64_t tmp_results[length]; - uint64_t tmp_map[2 * length]; - - rmm::device_vector src(length); - rmm::device_vector dst(length); - src_d = src.data().get(); - dst_d = dst.data().get(); - - EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice), - cudaSuccess); - - size_t unique_verts = 0; - - auto number_map = cugraph::detail::renumber_vertices(length, - src_d, - dst_d, - src_d, - dst_d, - &unique_verts, - cugraph::detail::HashFunctionObjectInt(511), - thrust::less(), - rmm::mr::get_current_device_resource()); - - EXPECT_EQ(cudaMemcpy( - tmp_map, number_map->data(), sizeof(uint64_t) * unique_verts, cudaMemcpyDeviceToHost), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(tmp_results, src_d, sizeof(uint64_t) * length, cudaMemcpyDeviceToHost), - cudaSuccess); - - for (size_t i = 0; i < length; ++i) { - EXPECT_EQ(tmp_results[i], src_expected[i]); - EXPECT_EQ(tmp_map[tmp_results[i]], src_data[i]); - } - - EXPECT_EQ(cudaMemcpy(tmp_results, dst_d, sizeof(uint64_t) * length, cudaMemcpyDeviceToHost), - cudaSuccess); - for (size_t i = 0; i < length; ++i) { - EXPECT_EQ(tmp_results[i], dst_expected[i]); - EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]); - } -} - -TEST_F(RenumberingTest, SmallFixedVertexList64BitTo32Bit) -{ - uint64_t src_data[] = {4U, 6U, 8U, 20U, 1U}; - uint64_t dst_data[] = {1U, 29U, 35U, 0U, 77U}; - - uint32_t src_expected[] = {2U, 3U, 4U, 
5U, 1U}; - uint32_t dst_expected[] = {1U, 6U, 7U, 0U, 8U}; - - size_t length = sizeof(src_data) / sizeof(src_data[0]); - - uint64_t *src_d; - uint64_t *dst_d; - uint32_t *src_renumbered_d; - uint32_t *dst_renumbered_d; - - uint32_t tmp_results[length]; - uint64_t tmp_map[2 * length]; - - rmm::device_vector src(length); - rmm::device_vector dst(length); - src_d = src.data().get(); - dst_d = dst.data().get(); - rmm::device_vector src_renumbered(length); - rmm::device_vector dst_renumbered(length); - src_renumbered_d = src_renumbered.data().get(); - dst_renumbered_d = dst_renumbered.data().get(); - - EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice), - cudaSuccess); - - size_t unique_verts = 0; - - auto number_map = cugraph::detail::renumber_vertices(length, - src_d, - dst_d, - src_renumbered_d, - dst_renumbered_d, - &unique_verts, - cugraph::detail::HashFunctionObjectInt(511), - thrust::less(), - rmm::mr::get_current_device_resource()); - - EXPECT_EQ(cudaMemcpy( - tmp_map, number_map->data(), sizeof(uint64_t) * unique_verts, cudaMemcpyDeviceToHost), - cudaSuccess); - EXPECT_EQ( - cudaMemcpy(tmp_results, src_renumbered_d, sizeof(uint32_t) * length, cudaMemcpyDeviceToHost), - cudaSuccess); - - for (size_t i = 0; i < length; ++i) { - EXPECT_EQ(tmp_results[i], src_expected[i]); - EXPECT_EQ(tmp_map[tmp_results[i]], src_data[i]); - } - - EXPECT_EQ( - cudaMemcpy(tmp_results, dst_renumbered_d, sizeof(uint32_t) * length, cudaMemcpyDeviceToHost), - cudaSuccess); - for (size_t i = 0; i < length; ++i) { - EXPECT_EQ(tmp_results[i], dst_expected[i]); - EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]); - } -} - -TEST_F(RenumberingTest, Random100KVertexSet) -{ - const int num_verts = 100000; - - uint64_t *src_d; - uint64_t *dst_d; - - std::vector src_data_vec(num_verts); - std::vector dst_data_vec(num_verts); - std::vector tmp_results_vec(num_verts); - std::vector tmp_map_vec(2 * num_verts); - - uint64_t *src_data = src_data_vec.data(); - uint64_t *dst_data = dst_data_vec.data(); - uint64_t *tmp_results = tmp_results_vec.data(); - uint64_t *tmp_map = tmp_map_vec.data(); - rmm::device_vector src(num_verts); - rmm::device_vector dst(num_verts); - src_d = src.data().get(); - dst_d = dst.data().get(); - - // - // Generate random source and vertex values - // - srand(43); - - for (int i = 0; i < num_verts; ++i) { src_data[i] = (uint64_t)rand(); } - - for (int i = 0; i < num_verts; ++i) { dst_data[i] = (uint64_t)rand(); } - - EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint64_t) * num_verts, cudaMemcpyHostToDevice), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(uint64_t) * num_verts, cudaMemcpyHostToDevice), - cudaSuccess); - - // - // Renumber everything - // - size_t unique_verts = 0; - size_t n_verts{num_verts}; - - auto start = std::chrono::system_clock::now(); - - auto number_map = cugraph::detail::renumber_vertices(n_verts, - src_d, - dst_d, - src_d, - dst_d, - &unique_verts, - cugraph::detail::HashFunctionObjectInt(511), - thrust::less(), - rmm::mr::get_current_device_resource()); - - auto end = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds = end - start; - - std::cout << "Renumber kernel elapsed time (ms): " << elapsed_seconds.count() * 1000 << std::endl; - - EXPECT_EQ(cudaMemcpy( - tmp_map, number_map->data(), sizeof(uint64_t) * unique_verts, cudaMemcpyDeviceToHost), - cudaSuccess); - 
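// [Illustrative aside] The expected values in the fixed-list tests above are consistent
// with each vertex id being replaced by its rank in the sorted set of unique ids (so
// src {4, 6, 8, 20, 1} and dst {1, 29, 35, 0, 77} renumber to {2, 3, 4, 5, 1} and
// {1, 6, 7, 0, 8}), with number_map recording the inverse. A host sketch of that
// contract (the deleted implementation itself was hash-based on the GPU):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
  std::vector<uint32_t> src{4, 6, 8, 20, 1}, dst{1, 29, 35, 0, 77};

  // number_map: sorted unique vertex ids; the new id is the position in this vector
  std::vector<uint32_t> number_map(src);
  number_map.insert(number_map.end(), dst.begin(), dst.end());
  std::sort(number_map.begin(), number_map.end());
  number_map.erase(std::unique(number_map.begin(), number_map.end()), number_map.end());

  auto renumber = [&](uint32_t v) {
    return static_cast<uint32_t>(
      std::lower_bound(number_map.begin(), number_map.end(), v) - number_map.begin());
  };
  assert(renumber(src[0]) == 2 && renumber(dst[4]) == 8);
  for (auto v : src) assert(number_map[renumber(v)] == v);  // un-renumbering round-trips
  return 0;
}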
EXPECT_EQ(cudaMemcpy(tmp_results, src_d, sizeof(uint64_t) * num_verts, cudaMemcpyDeviceToHost), - cudaSuccess); - - size_t min_id = unique_verts; - size_t max_id = 0; - - size_t cnt = 0; - for (size_t i = 0; i < num_verts; ++i) { - min_id = min(min_id, tmp_results[i]); - max_id = max(max_id, tmp_results[i]); - if (tmp_map[tmp_results[i]] != src_data[i]) ++cnt; - - if (cnt < 20) EXPECT_EQ(tmp_map[tmp_results[i]], src_data[i]); - } - - if (cnt > 0) printf(" src error count = %ld out of %d\n", cnt, num_verts); - - EXPECT_EQ(cudaMemcpy(tmp_results, dst_d, sizeof(uint64_t) * num_verts, cudaMemcpyDeviceToHost), - cudaSuccess); - for (size_t i = 0; i < num_verts; ++i) { - min_id = min(min_id, tmp_results[i]); - max_id = max(max_id, tmp_results[i]); - if (tmp_map[tmp_results[i]] != dst_data[i]) ++cnt; - - if (cnt < 20) EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]); - } - - if (cnt > 0) printf(" src error count = %ld out of %d\n", cnt, num_verts); - - EXPECT_EQ(min_id, 0); - EXPECT_EQ(max_id, (unique_verts - 1)); -} - -TEST_F(RenumberingTest, Random10MVertexSet) -{ - const int num_verts = 10000000; - - // A sampling of performance on single Quadro GV100 - // const int hash_size = 32767; // 238 ms - // const int hash_size = 8191; // 224 ms - const int hash_size = 511; // 224 ms - - uint32_t *src_d; - uint32_t *dst_d; - - rmm::device_vector src(num_verts); - rmm::device_vector dst(num_verts); - src_d = src.data().get(); - dst_d = dst.data().get(); - - // - // Init the random number generate - // - const int num_threads{64}; - curandState *state; - - rmm::device_vector state_vals(num_threads); - state = state_vals.data().get(); - setup_generator<<>>(state); - generate_sources<<>>(state, num_verts, src_d); - generate_destinations<<>>(state, num_verts, src_d, dst_d); - - std::cout << "done with initialization" << std::endl; - - // - // Renumber everything - // - size_t unique_verts = 0; - size_t n_verts{num_verts}; - - auto start = std::chrono::system_clock::now(); - auto number_map = - cugraph::detail::renumber_vertices(n_verts, - src_d, - dst_d, - src_d, - dst_d, - &unique_verts, - cugraph::detail::HashFunctionObjectInt(hash_size), - thrust::less(), - rmm::mr::get_current_device_resource()); - auto end = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds = end - start; - - std::cout << "Renumber kernel elapsed time (ms): " << elapsed_seconds.count() * 1000 << std::endl; - std::cout << " unique verts = " << unique_verts << std::endl; - std::cout << " hash size = " << hash_size << std::endl; -} - -TEST_F(RenumberingTest, Random100MVertexSet) -{ - const int num_verts = 100000000; - - // A sampling of performance on single Quadro GV100 - // const int hash_size = 8192; // 1811 ms - // const int hash_size = 16384; // 1746 ms - // const int hash_size = 32768; // 1662 ms - // const int hash_size = 65536; // 1569 ms - // const int hash_size = 16777216; // 1328 ms - const int hash_size = 511; - - uint32_t *src_d; - uint32_t *dst_d; - - rmm::device_vector src(num_verts); - rmm::device_vector dst(num_verts); - src_d = src.data().get(); - dst_d = dst.data().get(); - - // - // Init the random number generate - // - const int num_threads{64}; - curandState *state; - - rmm::device_vector state_vals(num_threads); - state = state_vals.data().get(); - setup_generator<<>>(state); - generate_sources<<>>(state, num_verts, src_d); - generate_destinations<<>>(state, num_verts, src_d, dst_d); - - std::cout << "done with initialization" << std::endl; - - // - // Renumber everything - // - size_t 
unique_verts = 0; - size_t n_verts{num_verts}; - - auto start = std::chrono::system_clock::now(); - auto number_map = - cugraph::detail::renumber_vertices(n_verts, - src_d, - dst_d, - src_d, - dst_d, - &unique_verts, - cugraph::detail::HashFunctionObjectInt(hash_size), - thrust::less(), - rmm::mr::get_current_device_resource()); - auto end = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds = end - start; - - std::cout << "Renumber kernel elapsed time (ms): " << elapsed_seconds.count() * 1000 << std::endl; - std::cout << " unique verts = " << unique_verts << std::endl; - std::cout << " hash size = " << hash_size << std::endl; -} - -TEST_F(RenumberingTest, Random500MVertexSet) -{ - const int num_verts = 500000000; - - // A sampling of performance on single Quadro GV100 - // const int hash_size = 8192; // 9918 ms - // const int hash_size = 16384; // 9550 ms - // const int hash_size = 32768; // 9146 ms - // const int hash_size = 131072; // 8537 ms - const int hash_size = 1048576; // 7335 ms - // const int hash_size = 511; // 7335 ms - - uint32_t *src_d; - uint32_t *dst_d; - - rmm::device_vector src(num_verts); - rmm::device_vector dst(num_verts); - src_d = src.data().get(); - dst_d = dst.data().get(); - - // - // Init the random number generate - // - const int num_threads{64}; - curandState *state; - - rmm::device_vector state_vals(num_threads); - state = state_vals.data().get(); - setup_generator<<>>(state); - generate_sources<<>>(state, num_verts, src_d); - generate_destinations<<>>(state, num_verts, src_d, dst_d); - - std::cout << "done with initialization" << std::endl; - - // - // Renumber everything - // - size_t unique_verts = 0; - size_t n_verts{num_verts}; - - auto start = std::chrono::system_clock::now(); - auto number_map = - cugraph::detail::renumber_vertices(n_verts, - src_d, - dst_d, - src_d, - dst_d, - &unique_verts, - cugraph::detail::HashFunctionObjectInt(hash_size), - thrust::less(), - rmm::mr::get_current_device_resource()); - auto end = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds = end - start; - - std::cout << "Renumber kernel elapsed time (ms): " << elapsed_seconds.count() * 1000 << std::endl; - std::cout << " unique verts = " << unique_verts << std::endl; - std::cout << " hash size = " << hash_size << std::endl; -} - -CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/sampling/random_walks_profiling.cu b/cpp/tests/sampling/random_walks_profiling.cu new file mode 100644 index 00000000000..595086a570e --- /dev/null +++ b/cpp/tests/sampling/random_walks_profiling.cu @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // cugraph::test::create_memory_resource() +#include +#include + +#include +#include + +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include + +/** + * @internal + * @brief Populates the device vector d_start with the starting vertex indices + * to be used for each RW path specified. + */ +template +void fill_start(raft::handle_t const& handle, + rmm::device_uvector& d_start, + index_t num_vertices) +{ + index_t num_paths = d_start.size(); + + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_paths), + + d_start.begin(), + [num_vertices] __device__(auto indx) { return indx % num_vertices; }); +} + +namespace impl_details = cugraph::experimental::detail; + +enum class traversal_id_t : int { HORIZONTAL = 0, VERTICAL }; + +/** + * @internal + * @brief Calls the random_walks algorithm with specified traversal strategy and displays the time + * metrics (total time for all requested paths, average time for each path). + */ +template +void output_random_walks_time(graph_vt const& graph_view, + typename graph_vt::edge_type num_paths, + traversal_id_t trv_id) +{ + using vertex_t = typename graph_vt::vertex_type; + using edge_t = typename graph_vt::edge_type; + using weight_t = typename graph_vt::weight_type; + + raft::handle_t handle{}; + rmm::device_uvector d_start(num_paths, handle.get_stream()); + + vertex_t num_vertices = graph_view.get_number_of_vertices(); + fill_start(handle, d_start, num_vertices); + + // 0-copy const device view: + // + impl_details::device_const_vector_view d_start_view{d_start.data(), num_paths}; + + edge_t max_depth{10}; + + HighResTimer hr_timer; + std::string label{}; + + if (trv_id == traversal_id_t::HORIZONTAL) { + label = std::string("RandomWalks; Horizontal traversal"); + hr_timer.start(label); + cudaProfilerStart(); + auto ret_tuple = + impl_details::random_walks_impl( + handle, graph_view, d_start_view, max_depth); + cudaProfilerStop(); + hr_timer.stop(); + } else { + label = std::string("RandomWalks; Vertical traversal"); + hr_timer.start(label); + cudaProfilerStart(); + auto ret_tuple = impl_details::random_walks_impl( + handle, graph_view, d_start_view, max_depth); + cudaProfilerStop(); + hr_timer.stop(); + } + try { + auto runtime = hr_timer.get_average_runtime(label); + + std::cout << "RW for num_paths: " << num_paths + << ", runtime [ms] / path: " << runtime / num_paths << ":\n"; + + } catch (std::exception const& ex) { + std::cerr << ex.what() << '\n'; + return; + + } catch (...) { + std::cerr << "ERROR: Unknown exception on timer label search." << '\n'; + return; + } + hr_timer.display(std::cout); +} + +/** + * @struct RandomWalks_Usecase + * @brief Used to specify input to a random_walks benchmark/profile run + * + * @var RandomWalks_Usecase::graph_file_full_path Computed during construction + * to be an absolute path consisting of the value of the RAPIDS_DATASET_ROOT_DIR + * env var and the graph_file_path constructor arg. This is initialized to an + * empty string. + * + * @var RandomWalks_Usecase::test_weighted Bool representing if the specified + * graph is weighted or not. This is initialized to false (unweighted). 
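 *
 * An illustrative construction (the relative path resolves against the
 * RAPIDS_DATASET_ROOT_DIR env var as described above; the dataset is one used
 * elsewhere in these tests):
 * @code
 * RandomWalks_Usecase usecase("test/datasets/karate.mtx", true);  // weighted
 * @endcode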
+ */ +struct RandomWalks_Usecase { + std::string graph_file_full_path{}; + bool test_weighted{false}; + + RandomWalks_Usecase(std::string const& graph_file_path, bool test_weighted) + : test_weighted(test_weighted) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +}; + +/** + * @brief Runs random_walks on a specified input and outputs time metrics + * + * Creates a graph_t instance from the configuration specified in the + * RandomWalks_Usecase instance passed in (currently by reading a dataset to + * populate the graph_t), then runs random_walks to generate 1, 10, and 100 + * random paths and output statistics for each. + * + * @tparam vertex_t Type of vertex identifiers. + * @tparam edge_t Type of edge identifiers. + * @tparam weight_t Type of weight identifiers. + * + * @param[in] configuration RandomWalks_Usecase instance containing the input + * file to read for constructing the graph_t. + * @param[in] trv_id traversal strategy. + */ +template +void run(RandomWalks_Usecase const& configuration, traversal_id_t trv_id) +{ + raft::handle_t handle{}; + + cugraph::experimental::graph_t graph(handle); + std::tie(graph, std::ignore) = + cugraph::test::read_graph_from_matrix_market_file( + handle, configuration.graph_file_full_path, configuration.test_weighted, false); + + auto graph_view = graph.view(); + + // FIXME: the num_paths vector might be better specified via the + // configuration input instead of hardcoding here. + std::vector v_np{1, 10, 100}; + for (auto&& num_paths : v_np) { + output_random_walks_time(graph_view, num_paths, trv_id); + } +} + +/** + * @brief Performs the random_walks benchmark/profiling run + * + * main function for performing the random_walks benchmark/profiling run. The + * resulting executable takes the following options: "rmm_mode" which can be one + * of "binning", "cuda", "pool", or "managed". "dataset" which is a path + * relative to the env var RAPIDS_DATASET_ROOT_DIR to an input .mtx file to use + * to populate the graph_t instance. + * + * To use the default values of rmm_mode=pool and + * dataset=test/datasets/karate.mtx: + * @code + * RANDOM_WALKS_PROFILING + * @endcode + * + * To specify managed memory and the netscience.mtx dataset (relative to a + * particular RAPIDS_DATASET_ROOT_DIR setting): + * @code + * RANDOM_WALKS_PROFILING --rmm_mode=managed --dataset=test/datasets/netscience.mtx + * @endcode + * + * @return An int representing a successful run. 0 indicates success.
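 *
 * An illustrative invocation selecting the binning allocator (any of the four
 * rmm_mode values listed above may be substituted):
 * @code
 * RANDOM_WALKS_PROFILING --rmm_mode=binning --dataset=test/datasets/karate.mtx
 * @endcode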
+ */ +int main(int argc, char** argv) +{ + // Add command-line processing, provide defaults + cxxopts::Options options(argv[0], " - Random Walks benchmark command line options"); + options.add_options()( + "rmm_mode", "RMM allocation mode", cxxopts::value()->default_value("pool")); + options.add_options()( + "dataset", "dataset", cxxopts::value()->default_value("test/datasets/karate.mtx")); + auto const cmd_options = options.parse(argc, argv); + auto const rmm_mode = cmd_options["rmm_mode"].as(); + auto const dataset = cmd_options["dataset"].as(); + + // Configure RMM + auto resource = cugraph::test::create_memory_resource(rmm_mode); + rmm::mr::set_current_device_resource(resource.get()); + + // Run benchmarks + std::cout << "Using dataset: " << dataset << std::endl; + + std::cout << "Horizontal traversal strategy:\n"; + run(RandomWalks_Usecase(dataset, true), traversal_id_t::HORIZONTAL); + + std::cout << "Vertical traversal strategy:\n"; + run(RandomWalks_Usecase(dataset, true), traversal_id_t::VERTICAL); + + // FIXME: consider returning non-zero for situations that warrant it (eg. if + // the algo ran but the results are invalid, if a benchmark threshold is + // exceeded, etc.) + return 0; +} diff --git a/cpp/tests/sampling/random_walks_test.cu b/cpp/tests/sampling/random_walks_test.cu new file mode 100644 index 00000000000..dc73c474356 --- /dev/null +++ b/cpp/tests/sampling/random_walks_test.cu @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cuda_profiler_api.h" +#include "gtest/gtest.h" + +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include "random_walks_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include + +namespace { // anonym. 
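// [Illustrative aside] fill_start below assigns path i the start vertex
// i % num_vertices, spreading the requested paths round-robin across the vertex
// range. A host-only analogue of the same assignment:

#include <vector>

std::vector<int> host_fill_start(int num_paths, int num_vertices)
{
  std::vector<int> start(num_paths);
  for (int i = 0; i < num_paths; ++i) { start[i] = i % num_vertices; }
  return start;  // e.g. num_paths = 5, num_vertices = 3 -> {0, 1, 2, 0, 1}
}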
+template +void fill_start(raft::handle_t const& handle, + rmm::device_uvector& d_start, + index_t num_vertices) +{ + index_t num_paths = d_start.size(); + + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_paths), + + d_start.begin(), + [num_vertices] __device__(auto indx) { return indx % num_vertices; }); +} +} // namespace + +namespace impl_details = cugraph::experimental::detail; + +enum class traversal_id_t : int { HORIZONTAL = 0, VERTICAL }; + +struct RandomWalks_Usecase { + std::string graph_file_full_path{}; + bool test_weighted{false}; + + RandomWalks_Usecase(std::string const& graph_file_path, bool test_weighted) + : test_weighted(test_weighted) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +}; + +class Tests_RandomWalks + : public ::testing::TestWithParam> { + public: + Tests_RandomWalks() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(std::tuple const& configuration) + { + raft::handle_t handle{}; + + // debuf info: + // + // std::cout << "read graph file: " << configuration.graph_file_full_path << std::endl; + + traversal_id_t trv_id = std::get<0>(configuration); + auto const& target = std::get<1>(configuration); + cugraph::experimental::graph_t graph(handle); + std::tie(graph, std::ignore) = + cugraph::test::read_graph_from_matrix_market_file( + handle, target.graph_file_full_path, target.test_weighted, false); + + auto graph_view = graph.view(); + + // call random_walks: + start_random_walks(graph_view, trv_id); + } + + template + void start_random_walks(graph_vt const& graph_view, traversal_id_t trv_id) + { + using vertex_t = typename graph_vt::vertex_type; + using edge_t = typename graph_vt::edge_type; + using weight_t = typename graph_vt::weight_type; + + raft::handle_t handle{}; + edge_t num_paths = 10; + rmm::device_uvector d_start(num_paths, handle.get_stream()); + + vertex_t num_vertices = graph_view.get_number_of_vertices(); + fill_start(handle, d_start, num_vertices); + + // 0-copy const device view: + // + impl_details::device_const_vector_view d_start_view{d_start.data(), + num_paths}; + + edge_t max_depth{10}; + + if (trv_id == traversal_id_t::HORIZONTAL) { + auto ret_tuple = + impl_details::random_walks_impl( + handle, graph_view, d_start_view, max_depth); + + // check results: + // + bool test_all_paths = cugraph::test::host_check_rw_paths( + handle, graph_view, std::get<0>(ret_tuple), std::get<1>(ret_tuple), std::get<2>(ret_tuple)); + + if (!test_all_paths) + std::cout << "starting seed on failure: " << std::get<3>(ret_tuple) << '\n'; + + ASSERT_TRUE(test_all_paths); + } else { // VERTICAL + auto ret_tuple = + impl_details::random_walks_impl( + handle, graph_view, d_start_view, max_depth); + + // check results: + // + bool test_all_paths = cugraph::test::host_check_rw_paths( + handle, graph_view, std::get<0>(ret_tuple), std::get<1>(ret_tuple), std::get<2>(ret_tuple)); + + if (!test_all_paths) + std::cout << "starting seed on failure: " << std::get<3>(ret_tuple) << '\n'; + + ASSERT_TRUE(test_all_paths); + } + } +}; + +TEST_P(Tests_RandomWalks, Initialize_i32_i32_f) +{ + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_SUITE_P( + simple_test, + 
+  Tests_RandomWalks,
+  ::testing::Combine(::testing::Values(traversal_id_t::HORIZONTAL, traversal_id_t::VERTICAL),
+                     ::testing::Values(RandomWalks_Usecase("test/datasets/karate.mtx", true),
+                                       RandomWalks_Usecase("test/datasets/web-Google.mtx", true),
+                                       RandomWalks_Usecase("test/datasets/ljournal-2008.mtx", true),
+                                       RandomWalks_Usecase("test/datasets/webbase-1M.mtx", true))));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/sampling/random_walks_utils.cuh b/cpp/tests/sampling/random_walks_utils.cuh
new file mode 100644
index 00000000000..df42242e6fe
--- /dev/null
+++ b/cpp/tests/sampling/random_walks_utils.cuh
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+// utilities for testing / verification of Random Walks functionality:
+//
+namespace cugraph {
+namespace test {
+
+template <typename value_t>
+using vector_test_t = cugraph::experimental::detail::device_vec_t<value_t>;  // for debug purposes
+
+// host side utility to check if a sequence of vertices is connected:
+//
+template <typename vertex_t, typename edge_t, typename weight_t>
+bool host_check_path(std::vector<edge_t> const& row_offsets,
+                     std::vector<vertex_t> const& col_inds,
+                     std::vector<weight_t> const& values,
+                     typename std::vector<vertex_t>::const_iterator v_path_begin,
+                     typename std::vector<vertex_t>::const_iterator v_path_end,
+                     typename std::vector<weight_t>::const_iterator w_path_begin)
+{
+  bool assert1 = (row_offsets.size() > 0);
+  bool assert2 = (col_inds.size() == values.size());
+
+  vertex_t num_rows = row_offsets.size() - 1;
+  edge_t nnz        = row_offsets.back();
+
+  bool assert3 = (nnz == static_cast<edge_t>(col_inds.size()));
+  if (assert1 == false || assert2 == false || assert3 == false) {
+    std::cerr << "CSR inconsistency\n";
+    return false;
+  }
+
+  auto it_w = w_path_begin;
+  for (auto it_v = v_path_begin; it_v != v_path_end - 1; ++it_v, ++it_w) {
+    auto crt_vertex  = *it_v;
+    auto next_vertex = *(it_v + 1);
+
+    auto begin      = col_inds.begin() + row_offsets[crt_vertex];
+    auto end        = col_inds.begin() + row_offsets[crt_vertex + 1];
+    auto found_next = std::find_if(
+      begin, end, [next_vertex](auto dst_vertex) { return dst_vertex == next_vertex; });
+    if (found_next == end) {
+      std::cerr << "vertex not found: " << next_vertex << " as neighbor of " << crt_vertex << '\n';
+      return false;
+    }
+
+    auto delta = row_offsets[crt_vertex] + std::distance(begin, found_next);
+
+    // std::cerr << "delta in ci: " << delta << '\n';
+    auto found_edge = values.begin() + delta;
+    if (*found_edge != *it_w) {
+      std::cerr << "weight not found: " << *found_edge << " between " << crt_vertex << " and "
+                << next_vertex << '\n';
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename vertex_t, typename edge_t, typename weight_t, typename index_t>
+bool host_check_rw_paths(
+  raft::handle_t const& handle,
+  cugraph::experimental::graph_view_t<vertex_t, edge_t, weight_t, false, false> const& graph_view,
+  vector_test_t<vertex_t> const& d_coalesced_v,
+  vector_test_t<weight_t> const& d_coalesced_w,
+  vector_test_t<index_t> const& d_sizes,
+  index_t num_paths = 0)  // only relevant for the padded case (in which case it must be
non-zero) +{ + edge_t num_edges = graph_view.get_number_of_edges(); + vertex_t num_vertices = graph_view.get_number_of_vertices(); + + auto offsets = graph_view.get_matrix_partition_view().get_offsets(); + auto indices = graph_view.get_matrix_partition_view().get_indices(); + auto values = graph_view.get_matrix_partition_view().get_weights(); + + std::vector v_ro(num_vertices + 1); + std::vector v_ci(num_edges); + std::vector v_vals( + num_edges, 1); // account for unweighted graph, for which RW provides default weights{1} + + raft::update_host(v_ro.data(), offsets, v_ro.size(), handle.get_stream()); + raft::update_host(v_ci.data(), indices, v_ci.size(), handle.get_stream()); + + if (values) { raft::update_host(v_vals.data(), *values, v_vals.size(), handle.get_stream()); } + + std::vector v_coalesced(d_coalesced_v.size()); + std::vector w_coalesced(d_coalesced_w.size()); + std::vector v_sizes(d_sizes.size()); + + raft::update_host(v_coalesced.data(), + cugraph::experimental::detail::raw_const_ptr(d_coalesced_v), + d_coalesced_v.size(), + handle.get_stream()); + raft::update_host(w_coalesced.data(), + cugraph::experimental::detail::raw_const_ptr(d_coalesced_w), + d_coalesced_w.size(), + handle.get_stream()); + + if (v_sizes.size() > 0) { // coalesced case + raft::update_host(v_sizes.data(), + cugraph::experimental::detail::raw_const_ptr(d_sizes), + d_sizes.size(), + handle.get_stream()); + } else { // padded case + if (num_paths == 0) { + std::cerr << "ERROR: padded case requires `num_paths` info.\n"; + return false; + } + + // extract sizes from v_coalesced (which now contains padded info) + // + auto max_depth = v_coalesced.size() / num_paths; + auto it_start_path = v_coalesced.begin(); + for (index_t row_index = 0; row_index < num_paths; ++row_index) { + auto it_end_path = it_start_path + max_depth; + auto it_padding_found = std::find(it_start_path, it_end_path, num_vertices); + + v_sizes.push_back(std::distance(it_start_path, it_padding_found)); + + it_start_path = it_end_path; + } + + // truncate padded vectors v_coalesced, w_coalesced: + // + v_coalesced.erase(std::remove(v_coalesced.begin(), v_coalesced.end(), num_vertices), + v_coalesced.end()); + + w_coalesced.erase(std::remove(w_coalesced.begin(), w_coalesced.end(), weight_t{0}), + w_coalesced.end()); + } + + auto it_v_begin = v_coalesced.begin(); + auto it_w_begin = w_coalesced.begin(); + for (auto&& crt_sz : v_sizes) { + auto it_v_end = it_v_begin + crt_sz; + + bool test_path = host_check_path(v_ro, v_ci, v_vals, it_v_begin, it_v_end, it_w_begin); + + it_v_begin = it_v_end; + it_w_begin += crt_sz - 1; + + if (!test_path) { // something went wrong; print to debug (since it's random) + raft::print_host_vector("sizes", v_sizes.data(), v_sizes.size(), std::cerr); + + raft::print_host_vector("coalesced v", v_coalesced.data(), v_coalesced.size(), std::cerr); + + raft::print_host_vector("coalesced w", w_coalesced.data(), w_coalesced.size(), std::cerr); + + return false; + } + } + return true; +} + +template +bool host_check_query_rw(raft::handle_t const& handle, + vector_test_t const& d_v_sizes, + vector_test_t const& d_v_offsets, + vector_test_t const& d_w_sizes, + vector_test_t const& d_w_offsets) +{ + index_t num_paths = d_v_sizes.size(); + + if (num_paths == 0) return false; + + std::vector v_sizes(num_paths); + std::vector v_offsets(num_paths); + std::vector w_sizes(num_paths); + std::vector w_offsets(num_paths); + + raft::update_host(v_sizes.data(), + cugraph::experimental::detail::raw_const_ptr(d_v_sizes), + num_paths, + 
handle.get_stream()); + + raft::update_host(v_offsets.data(), + cugraph::experimental::detail::raw_const_ptr(d_v_offsets), + num_paths, + handle.get_stream()); + + raft::update_host(w_sizes.data(), + cugraph::experimental::detail::raw_const_ptr(d_w_sizes), + num_paths, + handle.get_stream()); + + raft::update_host(w_offsets.data(), + cugraph::experimental::detail::raw_const_ptr(d_w_offsets), + num_paths, + handle.get_stream()); + + index_t crt_v_offset = 0; + index_t crt_w_offset = 0; + auto it_v_sz = v_sizes.begin(); + auto it_w_sz = w_sizes.begin(); + auto it_v_offset = v_offsets.begin(); + auto it_w_offset = w_offsets.begin(); + + bool flag_passed{true}; + + for (; it_v_sz != v_sizes.end(); ++it_v_sz, ++it_w_sz, ++it_v_offset, ++it_w_offset) { + if (*it_w_sz != (*it_v_sz) - 1) { + std::cerr << "ERROR: Incorrect weight path size: " << *it_w_sz << ", " << *it_v_sz << '\n'; + flag_passed = false; + break; + } + + if (*it_v_offset != crt_v_offset) { + std::cerr << "ERROR: Incorrect vertex path offset: " << *it_v_offset << ", " << crt_v_offset + << '\n'; + flag_passed = false; + break; + } + + if (*it_w_offset != crt_w_offset) { + std::cerr << "ERROR: Incorrect weight path offset: " << *it_w_offset << ", " << crt_w_offset + << '\n'; + flag_passed = false; + break; + } + + crt_v_offset += *it_v_sz; + crt_w_offset += *it_w_sz; + } + + if (!flag_passed) { + std::cerr << "v sizes:"; + std::copy(v_sizes.begin(), v_sizes.end(), std::ostream_iterator(std::cerr, ", ")); + std::cerr << '\n'; + + std::cerr << "v offsets:"; + std::copy(v_offsets.begin(), v_offsets.end(), std::ostream_iterator(std::cerr, ", ")); + std::cerr << '\n'; + + std::cerr << "w sizes:"; + std::copy(w_sizes.begin(), w_sizes.end(), std::ostream_iterator(std::cerr, ", ")); + std::cerr << '\n'; + + std::cerr << "w offsets:"; + std::copy(w_offsets.begin(), w_offsets.end(), std::ostream_iterator(std::cerr, ", ")); + std::cerr << '\n'; + } + + return flag_passed; +} + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/sampling/rw_low_level_test.cu b/cpp/tests/sampling/rw_low_level_test.cu new file mode 100644 index 00000000000..3b2779a5814 --- /dev/null +++ b/cpp/tests/sampling/rw_low_level_test.cu @@ -0,0 +1,1062 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "cuda_profiler_api.h" + +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include "random_walks_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include + +using namespace cugraph::experimental; + +template +using vector_test_t = detail::device_vec_t; // for debug purposes + +namespace { // anonym. 
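+
+// Verify that every sampled column index is a valid neighbor offset:
+// 0 <= col_indx[i] < crt_out_degs[i] for every path i that can still advance;
+// paths whose current out-degree is 0 are vacuously valid.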
+ +template +bool check_col_indices(raft::handle_t const& handle, + vector_test_t const& d_crt_out_degs, + vector_test_t const& d_col_indx, + index_t num_paths) +{ + bool all_indices_within_degs = thrust::all_of( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_paths), + [p_d_col_indx = detail::raw_const_ptr(d_col_indx), + p_d_crt_out_degs = detail::raw_const_ptr(d_crt_out_degs)] __device__(auto indx) { + if (p_d_crt_out_degs[indx] > 0) + return ((p_d_col_indx[indx] >= 0) && (p_d_col_indx[indx] < p_d_crt_out_degs[indx])); + else + return true; + }); + return all_indices_within_degs; +} + +} // namespace + +// FIXME (per rlratzel request): +// This test may be considered an e2e test +// which could be moved to a different test suite: +// +struct RandomWalksPrimsTest : public ::testing::Test { +}; + +TEST_F(RandomWalksPrimsTest, SimpleGraphRWStart) +{ + using namespace cugraph::experimental::detail; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = cugraph::test::make_graph( + handle, v_src, v_dst, std::optional>{v_w}, num_vertices, num_edges); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.get_matrix_partition_view().get_offsets(); + vertex_t const* indices = graph_view.get_matrix_partition_view().get_indices(); + weight_t const* values = *(graph_view.get_matrix_partition_view().get_weights()); + + std::vector v_ro(num_vertices + 1); + std::vector v_ci(num_edges); + std::vector v_vs(num_edges); + + raft::update_host(v_ro.data(), offsets, num_vertices + 1, handle.get_stream()); + raft::update_host(v_ci.data(), indices, num_edges, handle.get_stream()); + raft::update_host(v_vs.data(), values, num_edges, handle.get_stream()); + + std::vector v_ro_expected{0, 1, 3, 6, 7, 8, 8}; + std::vector v_ci_expected{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_vs_expected{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + EXPECT_EQ(v_ro, v_ro_expected); + EXPECT_EQ(v_ci, v_ci_expected); + EXPECT_EQ(v_vs, v_vs_expected); + + index_t num_paths = 4; + index_t max_depth = 3; + index_t total_sz = num_paths * max_depth; + + std::vector v_coalesced(total_sz, -1); + std::vector w_coalesced(total_sz - num_paths, -1); + + vector_test_t d_coalesced_v(total_sz, handle.get_stream()); + vector_test_t d_coalesced_w(total_sz - num_paths, handle.get_stream()); + + raft::update_device( + d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream()); + raft::update_device( + d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream()); + + std::vector v_start{1, 0, 4, 2}; + vector_test_t d_start(num_paths, handle.get_stream()); + + raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream()); + + vector_test_t d_sizes(num_paths, handle.get_stream()); + + random_walker_t rand_walker{handle, graph_view, num_paths, max_depth}; + + rand_walker.start(d_start, d_coalesced_v, d_sizes); + + std::vector v_coalesced_exp{1, -1, -1, 0, -1, -1, 4, -1, -1, 2, -1, -1}; + raft::update_host( + v_coalesced.data(), raw_const_ptr(d_coalesced_v), total_sz, handle.get_stream()); + EXPECT_EQ(v_coalesced, v_coalesced_exp); + + std::vector v_sizes{1, 1, 1, 1}; 
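+  // start() writes only each path's seed into the coalesced buffer (path p
+  // occupies the fixed stride [p * max_depth, (p + 1) * max_depth), with the
+  // -1 filler elsewhere), so every path size is expected to be 1 at this point:
+  //
+  //   path 0: {1, -1, -1}   path 1: {0, -1, -1}
+  //   path 2: {4, -1, -1}   path 3: {2, -1, -1}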
+ std::vector v_sz_exp(num_paths); + raft::update_host(v_sz_exp.data(), raw_const_ptr(d_sizes), num_paths, handle.get_stream()); + + EXPECT_EQ(v_sizes, v_sz_exp); +} + +TEST_F(RandomWalksPrimsTest, SimpleGraphCoalesceExperiments) +{ + using namespace cugraph::experimental::detail; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = cugraph::test::make_graph( + handle, v_src, v_dst, std::optional>{v_w}, num_vertices, num_edges); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.get_matrix_partition_view().get_offsets(); + vertex_t const* indices = graph_view.get_matrix_partition_view().get_indices(); + weight_t const* values = *(graph_view.get_matrix_partition_view().get_weights()); + + index_t num_paths = 4; + index_t max_depth = 3; + index_t total_sz = num_paths * max_depth; + + std::vector v_coalesced(total_sz, -1); + std::vector w_coalesced(total_sz - num_paths, -1); + + vector_test_t d_coalesced_v(total_sz, handle.get_stream()); + vector_test_t d_coalesced_w(total_sz - num_paths, handle.get_stream()); + + raft::update_device( + d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream()); + raft::update_device( + d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream()); + + std::vector v_start{1, 0, 4, 2}; + vector_test_t d_start(num_paths, handle.get_stream()); + + raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream()); + + vector_test_t d_sizes(num_paths, handle.get_stream()); + + random_walker_t rand_walker{handle, graph_view, num_paths, max_depth}; + + auto const& d_out_degs = rand_walker.get_out_degs(); + EXPECT_EQ(static_cast(num_vertices), d_out_degs.size()); + + std::vector v_out_degs(num_vertices); + raft::update_host( + v_out_degs.data(), raw_const_ptr(d_out_degs), num_vertices, handle.get_stream()); + + std::vector v_out_degs_exp{1, 2, 3, 1, 1, 0}; + EXPECT_EQ(v_out_degs, v_out_degs_exp); + + rand_walker.start(d_start, d_coalesced_v, d_sizes); + + // update crt_out_degs: + // + vector_test_t d_crt_out_degs(num_paths, handle.get_stream()); + rand_walker.gather_from_coalesced( + d_coalesced_v, d_out_degs, d_sizes, d_crt_out_degs, max_depth, num_paths); + + std::vector v_crt_out_degs(num_paths); + raft::update_host( + v_crt_out_degs.data(), raw_const_ptr(d_crt_out_degs), num_paths, handle.get_stream()); + + std::vector v_crt_out_degs_exp{2, 1, 1, 3}; + EXPECT_EQ(v_crt_out_degs, v_crt_out_degs_exp); +} + +TEST_F(RandomWalksPrimsTest, SimpleGraphColExtraction) +{ + using namespace cugraph::experimental::detail; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = cugraph::test::make_graph( + handle, v_src, v_dst, std::optional>{v_w}, num_vertices, num_edges); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.get_matrix_partition_view().get_offsets(); + vertex_t const* indices = graph_view.get_matrix_partition_view().get_indices(); + 
weight_t const* values = *(graph_view.get_matrix_partition_view().get_weights()); + + index_t num_paths = 4; + index_t max_depth = 3; + index_t total_sz = num_paths * max_depth; + + std::vector v_coalesced(total_sz, -1); + std::vector w_coalesced(total_sz - num_paths, -1); + + vector_test_t d_coalesced_v(total_sz, handle.get_stream()); + vector_test_t d_coalesced_w(total_sz - num_paths, handle.get_stream()); + + raft::update_device( + d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream()); + raft::update_device( + d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream()); + + std::vector v_start{1, 0, 4, 2}; + vector_test_t d_start(num_paths, handle.get_stream()); + + raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream()); + + vector_test_t d_sizes(num_paths, handle.get_stream()); + + random_walker_t rand_walker{handle, graph_view, num_paths, max_depth}; + + auto const& d_out_degs = rand_walker.get_out_degs(); + + rand_walker.start(d_start, d_coalesced_v, d_sizes); + + // update crt_out_degs: + // + vector_test_t d_crt_out_degs(num_paths, handle.get_stream()); + rand_walker.gather_from_coalesced( + d_coalesced_v, d_out_degs, d_sizes, d_crt_out_degs, max_depth, num_paths); + + col_indx_extract_t col_extractor{handle, + graph_view, + raw_const_ptr(d_crt_out_degs), + raw_const_ptr(d_sizes), + num_paths, + max_depth}; + + // typically given by random engine: + // + std::vector v_col_indx{1, 0, 0, 2}; + vector_test_t d_col_indx(num_paths, handle.get_stream()); + + raft::update_device(d_col_indx.data(), v_col_indx.data(), d_col_indx.size(), handle.get_stream()); + + vector_test_t d_next_v(num_paths, handle.get_stream()); + vector_test_t d_next_w(num_paths, handle.get_stream()); + + col_extractor(d_coalesced_v, d_col_indx, d_next_v, d_next_w); + + std::vector v_next_v(num_paths); + std::vector v_next_w(num_paths); + + raft::update_host(v_next_v.data(), raw_const_ptr(d_next_v), num_paths, handle.get_stream()); + raft::update_host(v_next_w.data(), raw_const_ptr(d_next_w), num_paths, handle.get_stream()); + + std::vector v_next_v_exp{4, 1, 5, 3}; + std::vector v_next_w_exp{2.1f, 0.1f, 7.1f, 5.1f}; + + EXPECT_EQ(v_next_v, v_next_v_exp); + EXPECT_EQ(v_next_w, v_next_w_exp); +} + +TEST_F(RandomWalksPrimsTest, SimpleGraphRndGenColIndx) +{ + using namespace cugraph::experimental::detail; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + using real_t = float; + using seed_t = long; + + using random_engine_t = rrandom_gen_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = cugraph::test::make_graph( + handle, v_src, v_dst, std::optional>{v_w}, num_vertices, num_edges); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.get_matrix_partition_view().get_offsets(); + vertex_t const* indices = graph_view.get_matrix_partition_view().get_indices(); + weight_t const* values = *(graph_view.get_matrix_partition_view().get_weights()); + + index_t num_paths = 4; + index_t max_depth = 3; + index_t total_sz = num_paths * max_depth; + + std::vector v_coalesced(total_sz, -1); + std::vector w_coalesced(total_sz - num_paths, -1); + + vector_test_t d_coalesced_v(total_sz, handle.get_stream()); + vector_test_t d_coalesced_w(total_sz - num_paths, 
handle.get_stream()); + + raft::update_device( + d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream()); + raft::update_device( + d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream()); + + std::vector v_start{1, 0, 4, 2}; + vector_test_t d_start(num_paths, handle.get_stream()); + + raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream()); + + vector_test_t d_sizes(num_paths, handle.get_stream()); + + random_walker_t rand_walker{handle, graph_view, num_paths, max_depth}; + + auto const& d_out_degs = rand_walker.get_out_degs(); + + rand_walker.start(d_start, d_coalesced_v, d_sizes); + + // update crt_out_degs: + // + vector_test_t d_crt_out_degs(num_paths, handle.get_stream()); + rand_walker.gather_from_coalesced( + d_coalesced_v, d_out_degs, d_sizes, d_crt_out_degs, max_depth, num_paths); + + // random engine generated: + // + vector_test_t d_col_indx(num_paths, handle.get_stream()); + vector_test_t d_random(num_paths, handle.get_stream()); + + seed_t seed = static_cast(std::time(nullptr)); + random_engine_t rgen(handle, num_paths, d_random, d_crt_out_degs, seed); + rgen.generate_col_indices(d_col_indx); + + bool all_indices_within_degs = check_col_indices(handle, d_crt_out_degs, d_col_indx, num_paths); + + ASSERT_TRUE(all_indices_within_degs); +} + +TEST_F(RandomWalksPrimsTest, SimpleGraphUpdatePathSizes) +{ + using namespace cugraph::experimental::detail; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + using real_t = float; + using seed_t = long; + + using random_engine_t = rrandom_gen_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = cugraph::test::make_graph( + handle, v_src, v_dst, std::optional>{v_w}, num_vertices, num_edges); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.get_matrix_partition_view().get_offsets(); + vertex_t const* indices = graph_view.get_matrix_partition_view().get_indices(); + weight_t const* values = *(graph_view.get_matrix_partition_view().get_weights()); + + index_t num_paths = 4; + index_t max_depth = 3; + index_t total_sz = num_paths * max_depth; + + std::vector v_coalesced(total_sz, -1); + std::vector w_coalesced(total_sz - num_paths, -1); + + vector_test_t d_coalesced_v(total_sz, handle.get_stream()); + vector_test_t d_coalesced_w(total_sz - num_paths, handle.get_stream()); + + raft::update_device( + d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream()); + raft::update_device( + d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream()); + + std::vector v_start{1, 0, 4, 2}; + vector_test_t d_start(num_paths, handle.get_stream()); + + raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream()); + + vector_test_t d_sizes(num_paths, handle.get_stream()); + + random_walker_t rand_walker{handle, graph_view, num_paths, max_depth}; + + auto const& d_out_degs = rand_walker.get_out_degs(); + + rand_walker.start(d_start, d_coalesced_v, d_sizes); + + // Fixed set of out-degs, as opposed to have them generated by the algorithm. 
+ // That's because I want to test a certain functionality in isolation + // + std::vector v_crt_out_degs{2, 0, 1, 0}; + vector_test_t d_crt_out_degs(num_paths, handle.get_stream()); + raft::update_device( + d_crt_out_degs.data(), v_crt_out_degs.data(), d_crt_out_degs.size(), handle.get_stream()); + + rand_walker.update_path_sizes(d_crt_out_degs, d_sizes); + + std::vector v_sizes(num_paths); + raft::update_host(v_sizes.data(), raw_const_ptr(d_sizes), num_paths, handle.get_stream()); + std::vector v_sizes_exp{2, 1, 2, 1}; + // i.e., corresponding 0-entries in crt-out-degs, don't get updated; + + EXPECT_EQ(v_sizes, v_sizes_exp); +} + +TEST_F(RandomWalksPrimsTest, SimpleGraphScatterUpdate) +{ + using namespace cugraph::experimental::detail; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = cugraph::test::make_graph( + handle, v_src, v_dst, std::optional>{v_w}, num_vertices, num_edges); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.get_matrix_partition_view().get_offsets(); + vertex_t const* indices = graph_view.get_matrix_partition_view().get_indices(); + weight_t const* values = *(graph_view.get_matrix_partition_view().get_weights()); + + index_t num_paths = 4; + index_t max_depth = 3; + index_t total_sz = num_paths * max_depth; + + std::vector v_coalesced(total_sz, -1); + std::vector w_coalesced(total_sz - num_paths, -1); + + vector_test_t d_coalesced_v(total_sz, handle.get_stream()); + vector_test_t d_coalesced_w(total_sz - num_paths, handle.get_stream()); + + raft::update_device( + d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream()); + raft::update_device( + d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream()); + + std::vector v_start{1, 0, 4, 2}; + vector_test_t d_start(num_paths, handle.get_stream()); + + raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream()); + + vector_test_t d_sizes(num_paths, handle.get_stream()); + + random_walker_t rand_walker{handle, graph_view, num_paths, max_depth}; + + auto const& d_out_degs = rand_walker.get_out_degs(); + + rand_walker.start(d_start, d_coalesced_v, d_sizes); + + // update crt_out_degs: + // + vector_test_t d_crt_out_degs(num_paths, handle.get_stream()); + rand_walker.gather_from_coalesced( + d_coalesced_v, d_out_degs, d_sizes, d_crt_out_degs, max_depth, num_paths); + + col_indx_extract_t col_extractor{handle, + graph_view, + raw_const_ptr(d_crt_out_degs), + raw_const_ptr(d_sizes), + num_paths, + max_depth}; + + // typically given by random engine: + // + std::vector v_col_indx{1, 0, 0, 2}; + vector_test_t d_col_indx(num_paths, handle.get_stream()); + + raft::update_device(d_col_indx.data(), v_col_indx.data(), d_col_indx.size(), handle.get_stream()); + + vector_test_t d_next_v(num_paths, handle.get_stream()); + vector_test_t d_next_w(num_paths, handle.get_stream()); + + col_extractor(d_coalesced_v, d_col_indx, d_next_v, d_next_w); + + rand_walker.update_path_sizes(d_crt_out_degs, d_sizes); + + // check start(): + // + { + std::vector v_coalesced_exp{1, -1, -1, 0, -1, -1, 4, -1, -1, 2, -1, -1}; + raft::update_host( + v_coalesced.data(), raw_const_ptr(d_coalesced_v), total_sz, handle.get_stream()); + 
EXPECT_EQ(v_coalesced, v_coalesced_exp); + } + + // check crt_out_degs: + // + { + std::vector v_crt_out_degs(num_paths); + raft::update_host( + v_crt_out_degs.data(), raw_const_ptr(d_crt_out_degs), num_paths, handle.get_stream()); + std::vector v_crt_out_degs_exp{2, 1, 1, 3}; + EXPECT_EQ(v_crt_out_degs, v_crt_out_degs_exp); + } + + // check paths sizes update: + // + { + std::vector v_sizes(num_paths); + raft::update_host(v_sizes.data(), raw_const_ptr(d_sizes), num_paths, handle.get_stream()); + std::vector v_sizes_exp{2, 2, 2, 2}; + // i.e., corresponding 0-entries in crt-out-degs, don't get updated; + EXPECT_EQ(v_sizes, v_sizes_exp); + } + + // check next step: + // + { + std::vector v_next_v(num_paths); + std::vector v_next_w(num_paths); + + raft::update_host(v_next_v.data(), raw_const_ptr(d_next_v), num_paths, handle.get_stream()); + raft::update_host(v_next_w.data(), raw_const_ptr(d_next_w), num_paths, handle.get_stream()); + + std::vector v_next_v_exp{4, 1, 5, 3}; + std::vector v_next_w_exp{2.1f, 0.1f, 7.1f, 5.1f}; + + EXPECT_EQ(v_next_v, v_next_v_exp); + EXPECT_EQ(v_next_w, v_next_w_exp); + } + + rand_walker.scatter_vertices(d_next_v, d_coalesced_v, d_crt_out_degs, d_sizes); + rand_walker.scatter_weights(d_next_w, d_coalesced_w, d_crt_out_degs, d_sizes); + + // check vertex/weight scatter: + // + { + raft::update_host( + v_coalesced.data(), raw_const_ptr(d_coalesced_v), total_sz, handle.get_stream()); + raft::update_host( + w_coalesced.data(), raw_const_ptr(d_coalesced_w), total_sz - num_paths, handle.get_stream()); + + std::vector v_coalesced_exp{1, 4, -1, 0, 1, -1, 4, 5, -1, 2, 3, -1}; + std::vector w_coalesced_exp{2.1, -1, 0.1, -1, 7.1, -1, 5.1, -1}; + + EXPECT_EQ(v_coalesced, v_coalesced_exp); + EXPECT_EQ(w_coalesced, w_coalesced_exp); + } +} + +TEST_F(RandomWalksPrimsTest, SimpleGraphCoalesceDefragment) +{ + using namespace cugraph::experimental::detail; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = cugraph::test::make_graph( + handle, v_src, v_dst, std::optional>{v_w}, num_vertices, num_edges); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.get_matrix_partition_view().get_offsets(); + vertex_t const* indices = graph_view.get_matrix_partition_view().get_indices(); + weight_t const* values = *(graph_view.get_matrix_partition_view().get_weights()); + + index_t num_paths = 4; + index_t max_depth = 3; + index_t total_sz = num_paths * max_depth; + + std::vector v_sizes{1, 2, 2, 1}; + vector_test_t d_sizes(num_paths, handle.get_stream()); + raft::update_device(d_sizes.data(), v_sizes.data(), d_sizes.size(), handle.get_stream()); + + std::vector v_coalesced(total_sz, -1); + v_coalesced[0] = 3; + v_coalesced[max_depth] = 5; + v_coalesced[max_depth + 1] = 2; + v_coalesced[2 * max_depth] = 4; + v_coalesced[2 * max_depth + 1] = 0; + v_coalesced[3 * max_depth] = 1; + + std::vector w_coalesced(total_sz - num_paths, -1); + w_coalesced[max_depth - 1] = 10.1; + w_coalesced[2 * max_depth - 2] = 11.2; + + vector_test_t d_coalesced_v(total_sz, handle.get_stream()); + vector_test_t d_coalesced_w(total_sz - num_paths, handle.get_stream()); + + raft::update_device( + d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream()); + 
raft::update_device( + d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream()); + + random_walker_t rand_walker{handle, graph_view, num_paths, max_depth}; + + rand_walker.stop(d_coalesced_v, d_coalesced_w, d_sizes); + + // check vertex/weight defragment: + // + { + v_coalesced.resize(d_coalesced_v.size()); + w_coalesced.resize(d_coalesced_w.size()); + + raft::update_host( + v_coalesced.data(), raw_const_ptr(d_coalesced_v), d_coalesced_v.size(), handle.get_stream()); + raft::update_host( + w_coalesced.data(), raw_const_ptr(d_coalesced_w), d_coalesced_w.size(), handle.get_stream()); + + std::vector v_coalesced_exp{3, 5, 2, 4, 0, 1}; + std::vector w_coalesced_exp{10.1, 11.2}; + + EXPECT_EQ(v_coalesced, v_coalesced_exp); + EXPECT_EQ(w_coalesced, w_coalesced_exp); + } +} + +TEST_F(RandomWalksPrimsTest, SimpleGraphRandomWalk) +{ + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = cugraph::test::make_graph( + handle, v_src, v_dst, std::optional>{v_w}, num_vertices, num_edges); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.get_matrix_partition_view().get_offsets(); + vertex_t const* indices = graph_view.get_matrix_partition_view().get_indices(); + weight_t const* values = *(graph_view.get_matrix_partition_view().get_weights()); + + std::vector v_ro(num_vertices + 1); + std::vector v_ci(num_edges); + std::vector v_vals(num_edges); + + raft::update_host(v_ro.data(), offsets, v_ro.size(), handle.get_stream()); + raft::update_host(v_ci.data(), indices, v_ci.size(), handle.get_stream()); + raft::update_host(v_vals.data(), values, v_vals.size(), handle.get_stream()); + + std::vector v_start{1, 0, 4, 2}; + vector_test_t d_v_start(v_start.size(), handle.get_stream()); + raft::update_device(d_v_start.data(), v_start.data(), d_v_start.size(), handle.get_stream()); + + index_t num_paths = v_start.size(); + index_t max_depth = 5; + + // 0-copy const device view: + // + detail::device_const_vector_view d_start_view{d_v_start.data(), num_paths}; + auto quad = detail::random_walks_impl(handle, graph_view, d_start_view, max_depth); + + auto& d_coalesced_v = std::get<0>(quad); + auto& d_coalesced_w = std::get<1>(quad); + auto& d_sizes = std::get<2>(quad); + auto seed0 = std::get<3>(quad); + + bool test_all_paths = + cugraph::test::host_check_rw_paths(handle, graph_view, d_coalesced_v, d_coalesced_w, d_sizes); + + if (!test_all_paths) std::cout << "starting seed on failure: " << seed0 << '\n'; + + ASSERT_TRUE(test_all_paths); +} + +TEST(RandomWalksQuery, GraphRWQueryOffsets) +{ + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = cugraph::test::make_graph( + handle, v_src, v_dst, std::optional>{v_w}, num_vertices, num_edges); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.get_matrix_partition_view().get_offsets(); + vertex_t const* indices = graph_view.get_matrix_partition_view().get_indices(); + weight_t const* values = 
*(graph_view.get_matrix_partition_view().get_weights()); + + std::vector v_ro(num_vertices + 1); + std::vector v_ci(num_edges); + std::vector v_vals(num_edges); + + raft::update_host(v_ro.data(), offsets, v_ro.size(), handle.get_stream()); + raft::update_host(v_ci.data(), indices, v_ci.size(), handle.get_stream()); + raft::update_host(v_vals.data(), values, v_vals.size(), handle.get_stream()); + + std::vector v_start{1, 0, 4, 2}; + vector_test_t d_v_start(v_start.size(), handle.get_stream()); + raft::update_device(d_v_start.data(), v_start.data(), d_v_start.size(), handle.get_stream()); + + index_t num_paths = v_start.size(); + index_t max_depth = 5; + + // 0-copy const device view: + // + detail::device_const_vector_view d_start_view{d_v_start.data(), num_paths}; + auto quad = detail::random_walks_impl(handle, graph_view, d_start_view, max_depth); + + auto& d_v_sizes = std::get<2>(quad); + auto seed0 = std::get<3>(quad); + + auto triplet = query_rw_sizes_offsets(handle, num_paths, detail::raw_const_ptr(d_v_sizes)); + + auto& d_v_offsets = std::get<0>(triplet); + auto& d_w_sizes = std::get<1>(triplet); + auto& d_w_offsets = std::get<2>(triplet); + + bool test_paths_sz = + cugraph::test::host_check_query_rw(handle, d_v_sizes, d_v_offsets, d_w_sizes, d_w_offsets); + + if (!test_paths_sz) std::cout << "starting seed on failure: " << seed0 << '\n'; + + ASSERT_TRUE(test_paths_sz); +} + +TEST(RandomWalksSpecialCase, SingleRandomWalk) +{ + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = cugraph::test::make_graph( + handle, v_src, v_dst, std::optional>{v_w}, num_vertices, num_edges); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.get_matrix_partition_view().get_offsets(); + vertex_t const* indices = graph_view.get_matrix_partition_view().get_indices(); + weight_t const* values = *(graph_view.get_matrix_partition_view().get_weights()); + + std::vector v_ro(num_vertices + 1); + std::vector v_ci(num_edges); + std::vector v_vals(num_edges); + + raft::update_host(v_ro.data(), offsets, v_ro.size(), handle.get_stream()); + raft::update_host(v_ci.data(), indices, v_ci.size(), handle.get_stream()); + raft::update_host(v_vals.data(), values, v_vals.size(), handle.get_stream()); + + std::vector v_start{2}; + vector_test_t d_v_start(v_start.size(), handle.get_stream()); + raft::update_device(d_v_start.data(), v_start.data(), d_v_start.size(), handle.get_stream()); + + index_t num_paths = v_start.size(); + index_t max_depth = 5; + + // 0-copy const device view: + // + detail::device_const_vector_view d_start_view{d_v_start.data(), num_paths}; + auto quad = detail::random_walks_impl(handle, graph_view, d_start_view, max_depth); + + auto& d_coalesced_v = std::get<0>(quad); + auto& d_coalesced_w = std::get<1>(quad); + auto& d_sizes = std::get<2>(quad); + auto seed0 = std::get<3>(quad); + + bool test_all_paths = + cugraph::test::host_check_rw_paths(handle, graph_view, d_coalesced_v, d_coalesced_w, d_sizes); + + if (!test_all_paths) std::cout << "starting seed on failure: " << seed0 << '\n'; + + ASSERT_TRUE(test_all_paths); +} + +TEST(RandomWalksSpecialCase, UnweightedGraph) +{ + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = 
vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + + auto graph = cugraph::test::make_graph( + handle, v_src, v_dst, std::nullopt, num_vertices, num_edges); // un-weighted + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.get_matrix_partition_view().get_offsets(); + vertex_t const* indices = graph_view.get_matrix_partition_view().get_indices(); + ASSERT_TRUE(graph_view.get_matrix_partition_view().get_weights().has_value() == false); + + std::vector v_ro(num_vertices + 1); + std::vector v_ci(num_edges); + + raft::update_host(v_ro.data(), offsets, v_ro.size(), handle.get_stream()); + raft::update_host(v_ci.data(), indices, v_ci.size(), handle.get_stream()); + + std::vector v_start{2}; + vector_test_t d_v_start(v_start.size(), handle.get_stream()); + raft::update_device(d_v_start.data(), v_start.data(), d_v_start.size(), handle.get_stream()); + + index_t num_paths = v_start.size(); + index_t max_depth = 5; + + // 0-copy const device view: + // + detail::device_const_vector_view d_start_view{d_v_start.data(), num_paths}; + auto quad = detail::random_walks_impl(handle, graph_view, d_start_view, max_depth); + + auto& d_coalesced_v = std::get<0>(quad); + auto& d_coalesced_w = std::get<1>(quad); + auto& d_sizes = std::get<2>(quad); + auto seed0 = std::get<3>(quad); + + bool test_all_paths = + cugraph::test::host_check_rw_paths(handle, graph_view, d_coalesced_v, d_coalesced_w, d_sizes); + + if (!test_all_paths) std::cout << "starting seed on failure: " << seed0 << '\n'; + + ASSERT_TRUE(test_all_paths); +} + +TEST(RandomWalksPadded, SimpleGraph) +{ + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = cugraph::test::make_graph( + handle, v_src, v_dst, std::optional>{v_w}, num_vertices, num_edges); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.get_matrix_partition_view().get_offsets(); + vertex_t const* indices = graph_view.get_matrix_partition_view().get_indices(); + weight_t const* values = *(graph_view.get_matrix_partition_view().get_weights()); + + std::vector v_ro(num_vertices + 1); + std::vector v_ci(num_edges); + std::vector v_vals(num_edges); + + raft::update_host(v_ro.data(), offsets, v_ro.size(), handle.get_stream()); + raft::update_host(v_ci.data(), indices, v_ci.size(), handle.get_stream()); + raft::update_host(v_vals.data(), values, v_vals.size(), handle.get_stream()); + + std::vector v_start{2}; + vector_test_t d_v_start(v_start.size(), handle.get_stream()); + raft::update_device(d_v_start.data(), v_start.data(), d_v_start.size(), handle.get_stream()); + + index_t num_paths = v_start.size(); + index_t max_depth = 5; + + // 0-copy const device view: + // + detail::device_const_vector_view d_start_view{d_v_start.data(), num_paths}; + bool use_padding{true}; + auto quad = detail::random_walks_impl(handle, graph_view, d_start_view, max_depth, use_padding); + + auto& d_coalesced_v = std::get<0>(quad); + auto& d_coalesced_w = std::get<1>(quad); + auto& d_sizes = std::get<2>(quad); + auto seed0 = std::get<3>(quad); + + ASSERT_TRUE(d_sizes.size() == 0); + + bool test_all_paths = 
cugraph::test::host_check_rw_paths( + handle, graph_view, d_coalesced_v, d_coalesced_w, d_sizes, num_paths); + + if (!test_all_paths) std::cout << "starting seed on failure: " << seed0 << '\n'; + + ASSERT_TRUE(test_all_paths); +} + +TEST(RandomWalksUtility, PathsToCOO) +{ + using namespace cugraph::experimental::detail; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + std::vector v_sizes{2, 1, 3, 5, 1}; + std::vector v_coalesced{5, 3, 4, 9, 0, 1, 6, 2, 7, 3, 2, 5}; + std::vector w_coalesced{0.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto num_paths = v_sizes.size(); + auto total_sz = v_coalesced.size(); + auto num_edges = w_coalesced.size(); + + ASSERT_TRUE(num_edges == total_sz - num_paths); + + vector_test_t d_coalesced_v(total_sz, handle.get_stream()); + vector_test_t d_sizes(num_paths, handle.get_stream()); + + raft::update_device( + d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream()); + raft::update_device(d_sizes.data(), v_sizes.data(), d_sizes.size(), handle.get_stream()); + + index_t coalesced_v_sz = d_coalesced_v.size(); + + auto tpl_coo_offsets = convert_paths_to_coo(handle, + coalesced_v_sz, + static_cast(num_paths), + d_coalesced_v.release(), + d_sizes.release()); + + auto&& d_src = std::move(std::get<0>(tpl_coo_offsets)); + auto&& d_dst = std::move(std::get<1>(tpl_coo_offsets)); + auto&& d_offsets = std::move(std::get<2>(tpl_coo_offsets)); + + ASSERT_TRUE(d_src.size() == num_edges); + ASSERT_TRUE(d_dst.size() == num_edges); + + std::vector v_src(num_edges, 0); + std::vector v_dst(num_edges, 0); + std::vector v_offsets(d_offsets.size(), 0); + + raft::update_host(v_src.data(), raw_const_ptr(d_src), d_src.size(), handle.get_stream()); + raft::update_host(v_dst.data(), raw_const_ptr(d_dst), d_dst.size(), handle.get_stream()); + raft::update_host( + v_offsets.data(), raw_const_ptr(d_offsets), d_offsets.size(), handle.get_stream()); + + std::vector v_src_exp{5, 9, 0, 6, 2, 7, 3}; + std::vector v_dst_exp{3, 0, 1, 2, 7, 3, 2}; + std::vector v_offsets_exp{0, 1, 3}; + + EXPECT_EQ(v_src, v_src_exp); + EXPECT_EQ(v_dst, v_dst_exp); + EXPECT_EQ(v_offsets, v_offsets_exp); +} diff --git a/cpp/tests/serialization/un_serialize_test.cpp b/cpp/tests/serialization/un_serialize_test.cpp new file mode 100644 index 00000000000..e65d37fd77a --- /dev/null +++ b/cpp/tests/serialization/un_serialize_test.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include "cuda_profiler_api.h" + +#include +#include + +#include +#include + +#include + +TEST(SerializationTest, GraphSerUnser) +{ + using namespace cugraph::serializer; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = cugraph::test::make_graph( + handle, v_src, v_dst, std::optional>{v_w}, num_vertices, num_edges); + + auto pair_sz = serializer_t::get_device_graph_sz_bytes(graph); + auto total_ser_sz = pair_sz.first + pair_sz.second; + + serializer_t ser(handle, total_ser_sz); + serializer_t::graph_meta_t graph_meta{}; + ser.serialize(graph, graph_meta); + + pair_sz = serializer_t::get_device_graph_sz_bytes(graph_meta); + auto post_ser_sz = pair_sz.first + pair_sz.second; + + EXPECT_EQ(total_ser_sz, post_ser_sz); + + auto graph_copy = ser.unserialize(pair_sz.first, pair_sz.second); + + auto pair = cugraph::test::compare_graphs(handle, graph, graph_copy); + if (pair.first == false) std::cerr << "Test failed with " << pair.second << ".\n"; + + ASSERT_TRUE(pair.first); +} + +TEST(SerializationTest, GraphDecoupledSerUnser) +{ + using namespace cugraph::serializer; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = double; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = cugraph::test::make_graph( + handle, v_src, v_dst, std::optional>{v_w}, num_vertices, num_edges); + + auto pair_sz = serializer_t::get_device_graph_sz_bytes(graph); + auto total_ser_sz = pair_sz.first + pair_sz.second; + + // use the following buffer to simulate communication between + // sender and reciever of the serialization: + // + rmm::device_uvector d_storage_comm(0, handle.get_stream()); + + { + serializer_t ser(handle, total_ser_sz); + serializer_t::graph_meta_t graph_meta{}; + ser.serialize(graph, graph_meta); + + pair_sz = serializer_t::get_device_graph_sz_bytes(graph_meta); + auto post_ser_sz = pair_sz.first + pair_sz.second; + + EXPECT_EQ(total_ser_sz, post_ser_sz); + + d_storage_comm.resize(total_ser_sz, handle.get_stream()); + raft::copy(d_storage_comm.data(), ser.get_storage(), total_ser_sz, handle.get_stream()); + } + + { + serializer_t ser(handle, d_storage_comm.data()); + + auto graph_copy = ser.unserialize(pair_sz.first, pair_sz.second); + + auto pair = cugraph::test::compare_graphs(handle, graph, graph_copy); + if (pair.first == false) std::cerr << "Test failed with " << pair.second << ".\n"; + + ASSERT_TRUE(pair.first); + } +} + +TEST(SerializationTest, UnweightedGraphDecoupledSerUnser) +{ + using namespace cugraph::serializer; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = double; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + + auto graph = cugraph::test::make_graph( + handle, v_src, v_dst, std::nullopt, num_vertices, num_edges); + + ASSERT_TRUE(graph.view().get_matrix_partition_view().get_weights().has_value() == false); + + auto pair_sz = 
serializer_t::get_device_graph_sz_bytes(graph); + auto total_ser_sz = pair_sz.first + pair_sz.second; + + // use the following buffer to simulate communication between + // sender and reciever of the serialization: + // + rmm::device_uvector d_storage_comm(0, handle.get_stream()); + + { + serializer_t ser(handle, total_ser_sz); + serializer_t::graph_meta_t graph_meta{}; + ser.serialize(graph, graph_meta); + + pair_sz = serializer_t::get_device_graph_sz_bytes(graph_meta); + auto post_ser_sz = pair_sz.first + pair_sz.second; + + EXPECT_EQ(total_ser_sz, post_ser_sz); + + d_storage_comm.resize(total_ser_sz, handle.get_stream()); + raft::copy(d_storage_comm.data(), ser.get_storage(), total_ser_sz, handle.get_stream()); + } + + { + serializer_t ser(handle, d_storage_comm.data()); + + auto graph_copy = ser.unserialize(pair_sz.first, pair_sz.second); + + ASSERT_TRUE(graph_copy.view().get_matrix_partition_view().get_weights().has_value() == false); + + auto pair = cugraph::test::compare_graphs(handle, graph, graph_copy); + if (pair.first == false) std::cerr << "Test failed with " << pair.second << ".\n"; + + ASSERT_TRUE(pair.first); + } +} diff --git a/cpp/tests/traversal/bfs_ref.h b/cpp/tests/traversal/bfs_ref.h index a32b2f99787..5efdce818e7 100644 --- a/cpp/tests/traversal/bfs_ref.h +++ b/cpp/tests/traversal/bfs_ref.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include template -void populate_neighbors(VT *indices, ET *offsets, VT w, std::vector &neighbors) +void populate_neighbors(VT* indices, ET* offsets, VT w, std::vector& neighbors) { ET edge_start = offsets[w]; ET edge_end = offsets[w + 1]; @@ -31,14 +31,14 @@ void populate_neighbors(VT *indices, ET *offsets, VT w, std::vector &neighbo // This implements the BFS based on (Brandes, 2001) for shortest path counting template -void ref_bfs(VT *indices, - ET *offsets, +void ref_bfs(VT* indices, + ET* offsets, VT const number_of_vertices, - std::queue &Q, - std::stack &S, - std::vector &dist, - std::vector> &pred, - std::vector &sigmas, + std::queue& Q, + std::stack& S, + std::vector& dist, + std::vector>& pred, + std::vector& sigmas, VT source) { std::vector neighbors; diff --git a/cpp/tests/traversal/bfs_test.cu b/cpp/tests/traversal/bfs_test.cu index d90da4367a0..b0da605a0a0 100644 --- a/cpp/tests/traversal/bfs_test.cu +++ b/cpp/tests/traversal/bfs_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,7 +19,7 @@ #include #include -#include +#include #include @@ -46,7 +46,7 @@ // C++ Reference Implementation // ============================================================================ template -bool compare_close(const T &a, const T &b, const precision_t epsilon, precision_t zero_threshold) +bool compare_close(const T& a, const T& b, const precision_t epsilon, precision_t zero_threshold) { return ((zero_threshold > a && zero_threshold > b)) || (a >= b * (1.0 - epsilon)) && (a <= b * (1.0 + epsilon)); @@ -59,9 +59,9 @@ typedef struct BFS_Usecase_t { std::string config_; // Path to graph file std::string file_path_; // Complete path to graph using dataset_root_dir int source_; // Starting point from the traversal - BFS_Usecase_t(const std::string &config, int source) : config_(config), source_(source) + BFS_Usecase_t(const std::string& config, int source) : config_(config), source_(source) { - const std::string &rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); + const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); if ((config_ != "") && (config_[0] != '/')) { file_path_ = rapidsDatasetRootDir + "/" + config_; } else { @@ -86,7 +86,7 @@ class Tests_BFS : public ::testing::TestWithParam { // WT edge weight data type // return_sp_counter should BFS return shortest path countner template - void run_current_test(const BFS_Usecase &configuration) + void run_current_test(const BFS_Usecase& configuration) { // Step 1: Construction of the graph based on configuration VT number_of_vertices; @@ -95,8 +95,8 @@ class Tests_BFS : public ::testing::TestWithParam { auto csr = cugraph::test::generate_graph_csr_from_mm(directed, configuration.file_path_); cudaDeviceSynchronize(); - cugraph::GraphCSRView G = csr->view(); - G.prop.directed = directed; + cugraph::legacy::GraphCSRView G = csr->view(); + G.prop.directed = directed; ASSERT_TRUE(configuration.source_ >= 0 && (VT)configuration.source_ < G.number_of_vertices) << "Starting sources should be >= 0 and" @@ -174,7 +174,7 @@ class Tests_BFS : public ::testing::TestWithParam { // that the predecessor obtained with the GPU implementation is one of the // predecessors obtained during the C++ BFS traversal VT pred = cugraph_pred[i]; // It could be equal to -1 if the node is never reached - constexpr VT invalid_vid = cugraph::invalid_vertex_id::value; + constexpr VT invalid_vid = cugraph::legacy::invalid_vertex_id::value; if (pred == invalid_vid) { EXPECT_TRUE(ref_bfs_pred[i].empty()) << "[MISMATCH][PREDECESSOR] vaid = " << i << " cugraph had not predecessor," @@ -224,13 +224,13 @@ TEST_P(Tests_BFS, CheckInt64_SP_COUNTER) run_current_test(GetParam()); } -INSTANTIATE_TEST_CASE_P(simple_test, - Tests_BFS, - ::testing::Values(BFS_Usecase("test/datasets/karate.mtx", 0), - BFS_Usecase("test/datasets/polbooks.mtx", 0), - BFS_Usecase("test/datasets/netscience.mtx", 0), - BFS_Usecase("test/datasets/netscience.mtx", 100), - BFS_Usecase("test/datasets/wiki2003.mtx", 1000), - BFS_Usecase("test/datasets/wiki-Talk.mtx", 1000))); +INSTANTIATE_TEST_SUITE_P(simple_test, + Tests_BFS, + ::testing::Values(BFS_Usecase("test/datasets/karate.mtx", 0), + BFS_Usecase("test/datasets/polbooks.mtx", 0), + BFS_Usecase("test/datasets/netscience.mtx", 0), + BFS_Usecase("test/datasets/netscience.mtx", 100), + BFS_Usecase("test/datasets/wiki2003.mtx", 1000), + BFS_Usecase("test/datasets/wiki-Talk.mtx", 1000))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/traversal/sssp_test.cu b/cpp/tests/traversal/sssp_test.cu index 
ea56d1d79cb..e221e7e3445 100644 --- a/cpp/tests/traversal/sssp_test.cu +++ b/cpp/tests/traversal/sssp_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * * NVIDIA CORPORATION and its licensors retain all intellectual property * and proprietary rights in and to this software, related documentation @@ -13,9 +13,9 @@ #include #include -#include #include -#include +#include +#include #include @@ -255,14 +255,14 @@ class Tests_SSSP : public ::testing::TestWithParam { ASSERT_TRUE(0); } - cugraph::GraphCOOView G_coo( + cugraph::legacy::GraphCOOView G_coo( &cooRowInd[0], &cooColInd[0], (DoRandomWeights ? &cooVal[0] : nullptr), num_vertices, num_edges); - auto G_unique = cugraph::coo_to_csr(G_coo); - cugraph::GraphCSRView G = G_unique->view(); + auto G_unique = cugraph::coo_to_csr(G_coo); + cugraph::legacy::GraphCSRView G = G_unique->view(); cudaDeviceSynchronize(); std::vector dist_vec; @@ -425,10 +425,10 @@ TEST_P(Tests_SSSP, CheckFP64_RANDOM_DIST_PREDS) // --gtest_filter=*simple_test* -INSTANTIATE_TEST_CASE_P(simple_test, - Tests_SSSP, - ::testing::Values(SSSP_Usecase(MTX, "test/datasets/dblp.mtx", 100), - SSSP_Usecase(MTX, "test/datasets/wiki2003.mtx", 100000), - SSSP_Usecase(MTX, "test/datasets/karate.mtx", 1))); +INSTANTIATE_TEST_SUITE_P(simple_test, + Tests_SSSP, + ::testing::Values(SSSP_Usecase(MTX, "test/datasets/dblp.mtx", 100), + SSSP_Usecase(MTX, "test/datasets/wiki2003.mtx", 100000), + SSSP_Usecase(MTX, "test/datasets/karate.mtx", 1))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/traversal/tsp_test.cu b/cpp/tests/traversal/tsp_test.cu new file mode 100644 index 00000000000..e00f2949af5 --- /dev/null +++ b/cpp/tests/traversal/tsp_test.cu @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + * + */ + +// TSP solver tests +// Author: Hugo Linsenmaier hlinsenmaier@nvidia.com + +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. 
diff --git a/cpp/tests/traversal/tsp_test.cu b/cpp/tests/traversal/tsp_test.cu new file mode 100644 index 00000000000..e00f2949af5 --- /dev/null +++ b/cpp/tests/traversal/tsp_test.cu
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ */
+
+// TSP solver tests
+// Author: Hugo Linsenmaier hlinsenmaier@nvidia.com
+
+#include +#include +#include
+
+#include +#include
+
+#include
+
+#include +#include
+
+#include
+
+#include +#include +#include
+
+typedef struct Tsp_Usecase_t { std::string tsp_file; float ref_cost; Tsp_Usecase_t(const std::string& a, const float c) { // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); if ((a != "") && (a[0] != '/')) { tsp_file = rapidsDatasetRootDir + "/" + a; } else { tsp_file = a; } ref_cost = c; } Tsp_Usecase_t& operator=(const Tsp_Usecase_t& rhs) { tsp_file = rhs.tsp_file; ref_cost = rhs.ref_cost; return *this; } } Tsp_Usecase;
+
+static std::vector<Tsp_Usecase> euc_2d{ {"tsplib/datasets/a280.tsp", 2579}, {"tsplib/datasets/berlin52.tsp", 7542}, {"tsplib/datasets/bier127.tsp", 118282}, {"tsplib/datasets/ch130.tsp", 6110}, {"tsplib/datasets/ch150.tsp", 6528}, {"tsplib/datasets/d1291.tsp", 50801}, {"tsplib/datasets/d1655.tsp", 62128}, {"tsplib/datasets/d198.tsp", 15780}, {"tsplib/datasets/d2103.tsp", 80450}, {"tsplib/datasets/d493.tsp", 35002}, {"tsplib/datasets/d657.tsp", 48912}, {"tsplib/datasets/eil101.tsp", 629}, {"tsplib/datasets/eil51.tsp", 426}, {"tsplib/datasets/eil76.tsp", 538}, {"tsplib/datasets/fl1400.tsp", 20127}, {"tsplib/datasets/fl1577.tsp", 22249}, {"tsplib/datasets/fl417.tsp", 11861}, {"tsplib/datasets/gil262.tsp", 2378}, {"tsplib/datasets/kroA100.tsp", 21282}, {"tsplib/datasets/kroA150.tsp", 26524}, {"tsplib/datasets/kroA200.tsp", 29368}, {"tsplib/datasets/kroB100.tsp", 22141}, {"tsplib/datasets/kroB150.tsp", 26130}, {"tsplib/datasets/kroB200.tsp", 29437}, {"tsplib/datasets/kroC100.tsp", 20749}, {"tsplib/datasets/kroD100.tsp", 21294}, {"tsplib/datasets/kroE100.tsp", 22068}, {"tsplib/datasets/lin105.tsp", 14379}, {"tsplib/datasets/lin318.tsp", 42029}, {"tsplib/datasets/nrw1379.tsp", 56638}, {"tsplib/datasets/p654.tsp", 34643}, {"tsplib/datasets/pcb1173.tsp", 56892}, {"tsplib/datasets/pcb442.tsp", 50778}, {"tsplib/datasets/pr1002.tsp", 259045}, {"tsplib/datasets/pr107.tsp", 44303}, {"tsplib/datasets/pr136.tsp", 96772}, {"tsplib/datasets/pr144.tsp", 58537}, {"tsplib/datasets/pr152.tsp", 73682}, {"tsplib/datasets/pr226.tsp", 80369}, {"tsplib/datasets/pr264.tsp", 49135}, {"tsplib/datasets/pr299.tsp", 48191}, {"tsplib/datasets/pr439.tsp", 107217}, {"tsplib/datasets/pr76.tsp", 108159}, {"tsplib/datasets/rat195.tsp", 2323}, {"tsplib/datasets/rat575.tsp", 6773}, {"tsplib/datasets/rat783.tsp", 8806}, {"tsplib/datasets/rat99.tsp", 1211}, {"tsplib/datasets/rd100.tsp", 7910}, {"tsplib/datasets/rd400.tsp", 15281}, {"tsplib/datasets/rl1323.tsp", 270199}, {"tsplib/datasets/st70.tsp", 675}, {"tsplib/datasets/ts225.tsp", 126643}, {"tsplib/datasets/tsp225.tsp", 3916}, {"tsplib/datasets/u1060.tsp", 224094}, {"tsplib/datasets/u1432.tsp", 152970}, {"tsplib/datasets/u159.tsp", 42080}, {"tsplib/datasets/u574.tsp", 36905}, {"tsplib/datasets/u724.tsp", 41910}, {"tsplib/datasets/vm1084.tsp", 239297}, };
+
+struct Route { std::vector<int> cities; std::vector<float> x_pos; std::vector<float> y_pos; };
+
+class Tests_Tsp : public ::testing::TestWithParam<Tsp_Usecase> { public: Tests_Tsp() {} static void SetupTestCase() {} static void TearDownTestCase() {} virtual void SetUp() {} virtual void TearDown() {} void run_current_test(const Tsp_Usecase& param) { const ::testing::TestInfo* const test_info =
::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + + std::string(test_info->name()) + std::string("_") + + cugraph::test::getFileName(param.tsp_file) + std::string("_") + + ss.str().c_str(); + + float tol = 1E-1f; + HighResClock hr_clock; + double time_tmp; + Route input; + + std::cout << "File: " << param.tsp_file.c_str() << "\n"; + int nodes = load_tsp(param.tsp_file.c_str(), &input); + + // Device alloc + raft::handle_t const handle; + auto stream = handle.get_stream(); + rmm::device_uvector vertices(static_cast(nodes), stream); + rmm::device_uvector route(static_cast(nodes), stream); + rmm::device_uvector x_pos(static_cast(nodes), stream); + rmm::device_uvector y_pos(static_cast(nodes), stream); + + int* vtx_ptr = vertices.data(); + int* d_route = route.data(); + float* d_x_pos = x_pos.data(); + float* d_y_pos = y_pos.data(); + + CUDA_TRY(cudaMemcpy(vtx_ptr, input.cities.data(), sizeof(int) * nodes, cudaMemcpyHostToDevice)); + CUDA_TRY( + cudaMemcpy(d_x_pos, input.x_pos.data(), sizeof(float) * nodes, cudaMemcpyHostToDevice)); + CUDA_TRY( + cudaMemcpy(d_y_pos, input.y_pos.data(), sizeof(float) * nodes, cudaMemcpyHostToDevice)); + + // Default parameters + int restarts = 4096; + bool beam_search = true; + int k = 4; + int nstart = 0; + bool verbose = false; + + hr_clock.start(); + cudaDeviceSynchronize(); + cudaProfilerStart(); + + float final_cost = cugraph::traveling_salesperson( + handle, vtx_ptr, d_x_pos, d_y_pos, nodes, restarts, beam_search, k, nstart, verbose, d_route); + cudaProfilerStop(); + cudaDeviceSynchronize(); + hr_clock.stop(&time_tmp); + + std::vector h_route; + h_route.resize(nodes); + std::vector h_vertices; + h_vertices.resize(nodes); + CUDA_TRY(cudaMemcpy(h_route.data(), d_route, sizeof(int) * nodes, cudaMemcpyDeviceToHost)); + cudaDeviceSynchronize(); + CUDA_TRY(cudaMemcpy(h_vertices.data(), vtx_ptr, sizeof(int) * nodes, cudaMemcpyDeviceToHost)); + cudaDeviceSynchronize(); + + std::cout << "tsp_time: " << time_tmp << " us" << std::endl; + std::cout << "Ref cost is: " << param.ref_cost << "\n"; + std::cout << "Final cost is: " << final_cost << "\n"; + float err = fabs(final_cost - param.ref_cost); + err /= param.ref_cost; + std::cout << "Approximation error is: " << err * 100 << "%\n"; + EXPECT_LE(err, tol); + + // Check route goes through each vertex once + size_t u_nodes = nodes; + std::set node_set(h_route.begin(), h_route.end()); + ASSERT_EQ(node_set.size(), u_nodes); + + // Bound check + int max = *std::max_element(h_vertices.begin(), h_vertices.end()); + int min = *std::min_element(h_vertices.begin(), h_vertices.end()); + EXPECT_GE(*node_set.begin(), min); + EXPECT_LE(*node_set.rbegin(), max); + } + + private: + std::vector split(const std::string& s, char delimiter) + { + std::vector tokens; + std::string token; + std::istringstream tokenStream(s); + while (std::getline(tokenStream, token, delimiter)) { + if (token.size() == 0) continue; + tokens.push_back(token); + } + return tokens; + } + + // FIXME: At the moment TSP does not accept a graph_t as input and therefore + // deviates from the standard testing I/O pattern. Once other input types + // are supported we want to reconcile TSP testing with the rest of cugraph. 
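For context, `load_tsp` below expects a TSPLIB-style text file: header lines of the form `KEY : value` (only `DIMENSION` is consumed), followed by whitespace-separated `id x y` coordinate rows. A hypothetical toy instance of that shape, not taken from the tsplib datasets:

```
NAME : toy3
TYPE : TSP
DIMENSION : 3
EDGE_WEIGHT_TYPE : EUC_2D
NODE_COORD_SECTION
1 0.0 0.0
2 3.0 0.0
3 0.0 4.0
```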
+ int load_tsp(const char* fname, Route* input) + { + std::fstream fs; + fs.open(fname); + std::string line; + std::vector tokens; + int nodes = 0; + while (std::getline(fs, line) && line.find(':') != std::string::npos) { + tokens = split(line, ':'); + auto strip_token = split(tokens[0], ' ')[0]; + if (strip_token == "DIMENSION") nodes = std::stof(tokens[1]); + } + + while (std::getline(fs, line) && line.find(' ') != std::string::npos) { + tokens = split(line, ' '); + auto city_id = std::stof(tokens[0]); + auto x = std::stof(tokens[1]); + auto y = std::stof(tokens[2]); + input->cities.push_back(city_id); + input->x_pos.push_back(x); + input->y_pos.push_back(y); + } + fs.close(); + assert(nodes == input->cities.size()); + return nodes; + } +}; + +TEST_P(Tests_Tsp, CheckFP32_T) { run_current_test(GetParam()); } + +INSTANTIATE_TEST_SUITE_P(simple_test, Tests_Tsp, ::testing::ValuesIn(euc_2d)); +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/tree/mst_test.cu b/cpp/tests/tree/mst_test.cu index 949d6bae59b..ffbddd96eb0 100644 --- a/cpp/tests/tree/mst_test.cu +++ b/cpp/tests/tree/mst_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,8 +21,8 @@ #include #include -#include -#include +#include +#include #include #include @@ -105,13 +105,14 @@ class Tests_Mst : public ::testing::TestWithParam { raft::handle_t handle; std::cout << std::endl; - cugraph::GraphCOOView G_coo(&cooRowInd[0], &cooColInd[0], &cooVal[0], m, nnz); + cugraph::legacy::GraphCOOView G_coo( + &cooRowInd[0], &cooColInd[0], &cooVal[0], m, nnz); auto G_unique = cugraph::coo_to_csr(G_coo); - cugraph::GraphCSRView G(G_unique->view().offsets, - G_unique->view().indices, - G_unique->view().edge_data, - G_unique->view().number_of_vertices, - G_unique->view().number_of_edges); + cugraph::legacy::GraphCSRView G(G_unique->view().offsets, + G_unique->view().indices, + G_unique->view().edge_data, + G_unique->view().number_of_vertices, + G_unique->view().number_of_edges); cudaDeviceSynchronize(); @@ -144,8 +145,8 @@ TEST_P(Tests_Mst, CheckFP32_T) { run_current_test(GetParam()); } TEST_P(Tests_Mst, CheckFP64_T) { run_current_test(GetParam()); } -INSTANTIATE_TEST_CASE_P(simple_test, - Tests_Mst, - ::testing::Values(Mst_Usecase("test/datasets/netscience.mtx"))); +INSTANTIATE_TEST_SUITE_P(simple_test, + Tests_Mst, + ::testing::Values(Mst_Usecase("test/datasets/netscience.mtx"))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp index 535b4b9c79e..75570b2c467 100644 --- a/cpp/tests/utilities/base_fixture.hpp +++ b/cpp/tests/utilities/base_fixture.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,8 @@ #pragma once +#include #include -#include #include @@ -32,23 +32,28 @@ namespace cugraph { namespace test { +// FIXME: The BaseFixture class is not used in any tests. This file is only needed for the +// CUGRAPH_TEST_PROGRAM_MAIN macro and the code that it calls, so consider removing the BaseFixture +// class and renaming this file, or moving CUGRAPH_TEST_PROGRAM_MAIN to the test_utilities.hpp file +// and removing this file completely. 
+ /**
- * @brief Base test fixture class from which all libcudf tests should inherit.
+ * @brief Base test fixture class from which all libcugraph tests should inherit.
 * * Example: * ```
- * class MyTestFixture : public cudf::test::BaseFixture {};
+ * class MyTestFixture : public cugraph::test::BaseFixture {};
 * ``` **/ class BaseFixture : public ::testing::Test {
- rmm::mr::device_memory_resource *_mr{rmm::mr::get_current_device_resource()};
+ rmm::mr::device_memory_resource* _mr{rmm::mr::get_current_device_resource()};
 public: /**
- * @brief Returns pointer to `device_memory_resource` that should be used for
- * all tests inheriting from this fixture
+ * @brief Returns pointer to `device_memory_resource` that should be used for all tests inheriting
+ * from this fixture
 **/
- rmm::mr::device_memory_resource *mr() { return _mr; }
+ rmm::mr::device_memory_resource* mr() { return _mr; }
 }; /// MR factory functions
@@ -71,27 +76,26 @@ inline auto make_binning() } /**
- * @brief Creates a memory resource for the unit test environment
- * given the name of the allocation mode.
+ * @brief Creates a memory resource for the unit test environment given the name of the allocation
+ * mode.
 *
- * The returned resource instance must be kept alive for the duration of
- * the tests. Attaching the resource to a TestEnvironment causes
- * issues since the environment objects are not destroyed until
+ * The returned resource instance must be kept alive for the duration of the tests. Attaching the
+ * resource to a TestEnvironment causes issues since the environment objects are not destroyed until
 * after the runtime is shut down. *
- * @throw cudf::logic_error if the `allocation_mode` is unsupported.
+ * @throw cugraph::logic_error if the `allocation_mode` is unsupported.
 * * @param allocation_mode String identifying which resource type. * Accepted types are "pool", "cuda", "managed", and "binning". * @return Memory resource instance */ inline std::shared_ptr create_memory_resource(
- std::string const &allocation_mode)
+ std::string const& allocation_mode)
 { if (allocation_mode == "binning") return make_binning(); if (allocation_mode == "cuda") return make_cuda(); if (allocation_mode == "pool") return make_pool();
- if (allocation_mode == "managed") make_managed();
+ if (allocation_mode == "managed") return make_managed();
 CUGRAPH_FAIL("Invalid RMM allocation mode"); }
@@ -99,22 +103,22 @@ inline std::shared_ptr create_memory_resource( } // namespace cugraph /**
- * @brief Parses the cuDF test command line options.
+ * @brief Parses the cuGraph test command line options.
 *
- * Currently only supports 'rmm_mode' string paramater, which set the rmm
- * allocation mode. The default value of the parameter is 'pool'.
+ * Currently only supports the 'rmm_mode' string parameter, which sets the rmm allocation mode. The
+ * default value of the parameter is 'pool'.
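+ * For example, a built test binary (the name here is hypothetical) might be launched as
+ *   ./GRAPH_TEST --rmm_mode=managed
+ * where 'pool' (the default), 'cuda', 'managed', and 'binning' select the corresponding
+ * MR factory dispatched by create_memory_resource above.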
* * @return Parsing results in the form of cxxopts::ParseResult */ -inline auto parse_test_options(int argc, char **argv) +inline auto parse_test_options(int argc, char** argv) { try { - cxxopts::Options options(argv[0], " - cuDF tests command line options"); + cxxopts::Options options(argv[0], " - cuGraph tests command line options"); options.allow_unrecognised_options().add_options()( "rmm_mode", "RMM allocation mode", cxxopts::value()->default_value("pool")); return options.parse(argc, argv); - } catch (const cxxopts::OptionException &e) { + } catch (const cxxopts::OptionException& e) { CUGRAPH_FAIL("Error parsing command line options"); } } @@ -122,16 +126,14 @@ inline auto parse_test_options(int argc, char **argv) /** * @brief Macro that defines main function for gtest programs that use rmm * - * Should be included in every test program that uses rmm allocators since - * it maintains the lifespan of the rmm default memory resource. - * This `main` function is a wrapper around the google test generated `main`, - * maintaining the original functionality. In addition, this custom `main` - * function parses the command line to customize test behavior, like the - * allocation mode used for creating the default memory resource. - * + * Should be included in every test program that uses rmm allocators since it maintains the lifespan + * of the rmm default memory resource. This `main` function is a wrapper around the google test + * generated `main`, maintaining the original functionality. In addition, this custom `main` + * function parses the command line to customize test behavior, like the allocation mode used for + * creating the default memory resource. */ #define CUGRAPH_TEST_PROGRAM_MAIN() \ - int main(int argc, char **argv) \ + int main(int argc, char** argv) \ { \ ::testing::InitGoogleTest(&argc, argv); \ auto const cmd_opts = parse_test_options(argc, argv); \ @@ -140,3 +142,26 @@ inline auto parse_test_options(int argc, char **argv) rmm::mr::set_current_device_resource(resource.get()); \ return RUN_ALL_TESTS(); \ } + +#define CUGRAPH_MG_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + MPI_TRY(MPI_Init(&argc, &argv)); \ + int comm_rank{}; \ + int comm_size{}; \ + MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank)); \ + MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &comm_size)); \ + int num_gpus{}; \ + CUDA_TRY(cudaGetDeviceCount(&num_gpus)); \ + CUGRAPH_EXPECTS( \ + comm_size <= num_gpus, "# MPI ranks (%d) > # GPUs (%d).", comm_size, num_gpus); \ + CUDA_TRY(cudaSetDevice(comm_rank)); \ + ::testing::InitGoogleTest(&argc, argv); \ + auto const cmd_opts = parse_test_options(argc, argv); \ + auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ + auto resource = cugraph::test::create_memory_resource(rmm_mode); \ + rmm::mr::set_current_device_resource(resource.get()); \ + auto ret = RUN_ALL_TESTS(); \ + MPI_TRY(MPI_Finalize()); \ + return ret; \ + } diff --git a/cpp/tests/utilities/cxxopts.hpp b/cpp/tests/utilities/cxxopts.hpp index 9a0b6e500d6..5aa77723a1f 100644 --- a/cpp/tests/utilities/cxxopts.hpp +++ b/cpp/tests/utilities/cxxopts.hpp @@ -17,6 +17,22 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #ifndef CXXOPTS_HPP_INCLUDED #define CXXOPTS_HPP_INCLUDED @@ -89,7 +105,9 @@ inline String& stringAppend(String& s, String a) { return s.append(std::move(a)) inline String& stringAppend(String& s, int n, UChar32 c) { - for (int i = 0; i != n; ++i) { s.append(c); } + for (int i = 0; i != n; ++i) { + s.append(c); + } return s; } @@ -1449,7 +1467,9 @@ inline void Options::generate_all_groups_help(String& result) const std::vector all_groups; all_groups.reserve(m_help.size()); - for (auto& group : m_help) { all_groups.push_back(group.first); } + for (auto& group : m_help) { + all_groups.push_back(group.first); + } generate_group_help(result, all_groups); } @@ -1494,4 +1514,4 @@ inline const HelpGroupDetails& Options::group_help(const std::string& group) con } // namespace cxxopts -#endif // CXXOPTS_HPP_INCLUDED \ No newline at end of file +#endif // CXXOPTS_HPP_INCLUDED diff --git a/cpp/tests/utilities/device_comm_wrapper.cu b/cpp/tests/utilities/device_comm_wrapper.cu new file mode 100644 index 00000000000..2fee7719e36 --- /dev/null +++ b/cpp/tests/utilities/device_comm_wrapper.cu @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device_comm_wrapper.hpp" + +#include +#include + +#include +#include + +namespace cugraph { +namespace test { + +template +rmm::device_uvector device_gatherv(raft::handle_t const& handle, T const* d_input, size_t size) +{ + bool is_root = handle.get_comms().get_rank() == int{0}; + auto rx_sizes = cugraph::experimental::host_scalar_gather( + handle.get_comms(), size, int{0}, handle.get_stream()); + std::vector rx_displs(is_root ? static_cast(handle.get_comms().get_size()) + : size_t{0}); + if (is_root) { std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); } + + rmm::device_uvector gathered_v( + is_root ? 
std::reduce(rx_sizes.begin(), rx_sizes.end()) : size_t{0}, handle.get_stream()); + + cugraph::experimental::device_gatherv(handle.get_comms(), + d_input, + gathered_v.data(), + size, + rx_sizes, + rx_displs, + int{0}, + handle.get_stream()); + + return gathered_v; +} + +// explicit instantiation + +template rmm::device_uvector device_gatherv(raft::handle_t const& handle, + int32_t const* d_input, + size_t size); + +template rmm::device_uvector device_gatherv(raft::handle_t const& handle, + int64_t const* d_input, + size_t size); + +template rmm::device_uvector device_gatherv(raft::handle_t const& handle, + float const* d_input, + size_t size); + +template rmm::device_uvector device_gatherv(raft::handle_t const& handle, + double const* d_input, + size_t size); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/utilities/device_comm_wrapper.hpp b/cpp/tests/utilities/device_comm_wrapper.hpp new file mode 100644 index 00000000000..55145edd71b --- /dev/null +++ b/cpp/tests/utilities/device_comm_wrapper.hpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace test { + +template +rmm::device_uvector device_gatherv(raft::handle_t const& handle, T const* d_input, size_t size); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/utilities/matrix_market_file_utilities.cu b/cpp/tests/utilities/matrix_market_file_utilities.cu new file mode 100644 index 00000000000..711f332ae2f --- /dev/null +++ b/cpp/tests/utilities/matrix_market_file_utilities.cu @@ -0,0 +1,652 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include + +namespace cugraph { +namespace test { + +/// Read matrix properties from Matrix Market file +/** Matrix Market file is assumed to be a sparse matrix in coordinate + * format. + * + * @param f File stream for Matrix Market file. + * @param tg Boolean indicating whether to convert matrix to general + * format (from symmetric, Hermitian, or skew symmetric format). + * @param t (Output) MM_typecode with matrix properties. + * @param m (Output) Number of matrix rows. + * @param n (Output) Number of matrix columns. + * @param nnz (Output) Number of non-zero matrix entries. + * @return Zero if properties were read successfully. Otherwise + * non-zero. 
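+ *
+ * Example: with tg != 0, a symmetric matrix whose file stores two off-diagonal entries
+ * and one diagonal entry reports *nnz = 2*3 - 1 = 5; the stored count is first doubled,
+ * then each diagonal entry is subtracted so it is not double-counted.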
+ */ +template +int mm_properties(FILE* f, int tg, MM_typecode* t, IndexType_* m, IndexType_* n, IndexType_* nnz) +{ + // Read matrix properties from file + int mint, nint, nnzint; + if (fseek(f, 0, SEEK_SET)) { + fprintf(stderr, "Error: could not set position in file\n"); + return -1; + } + if (mm_read_banner(f, t)) { + fprintf(stderr, "Error: could not read Matrix Market file banner\n"); + return -1; + } + if (!mm_is_matrix(*t) || !mm_is_coordinate(*t)) { + fprintf(stderr, "Error: file does not contain matrix in coordinate format\n"); + return -1; + } + if (mm_read_mtx_crd_size(f, &mint, &nint, &nnzint)) { + fprintf(stderr, "Error: could not read matrix dimensions\n"); + return -1; + } + if (!mm_is_pattern(*t) && !mm_is_real(*t) && !mm_is_integer(*t) && !mm_is_complex(*t)) { + fprintf(stderr, "Error: matrix entries are not valid type\n"); + return -1; + } + *m = mint; + *n = nint; + *nnz = nnzint; + + // Find total number of non-zero entries + if (tg && !mm_is_general(*t)) { + // Non-diagonal entries should be counted twice + *nnz *= 2; + + // Diagonal entries should not be double-counted + int st; + for (int i = 0; i < nnzint; ++i) { + // Read matrix entry + // MTX only supports int for row and col idx + int row, col; + double rval, ival; + if (mm_is_pattern(*t)) + st = fscanf(f, "%d %d\n", &row, &col); + else if (mm_is_real(*t) || mm_is_integer(*t)) + st = fscanf(f, "%d %d %lg\n", &row, &col, &rval); + else // Complex matrix + st = fscanf(f, "%d %d %lg %lg\n", &row, &col, &rval, &ival); + if (ferror(f) || (st == EOF)) { + fprintf(stderr, "Error: error %d reading Matrix Market file (entry %d)\n", st, i + 1); + return -1; + } + + // Check if entry is diagonal + if (row == col) --(*nnz); + } + } + + return 0; +} + +/// Read Matrix Market file and convert to COO format matrix +/** Matrix Market file is assumed to be a sparse matrix in coordinate + * format. + * + * @param f File stream for Matrix Market file. + * @param tg Boolean indicating whether to convert matrix to general + * format (from symmetric, Hermitian, or skew symmetric format). + * @param nnz Number of non-zero matrix entries. + * @param cooRowInd (Output) Row indices for COO matrix. Should have + * at least nnz entries. + * @param cooColInd (Output) Column indices for COO matrix. Should + * have at least nnz entries. + * @param cooRVal (Output) Real component of COO matrix + * entries. Should have at least nnz entries. Ignored if null + * pointer. + * @param cooIVal (Output) Imaginary component of COO matrix + * entries. Should have at least nnz entries. Ignored if null + * pointer. + * @return Zero if matrix was read successfully. Otherwise non-zero. 
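+ *
+ * Example (hypothetical file body): a 3x3 coordinate pattern symmetric matrix storing
+ * the 1-based entries (2,1) and (3,2) expands, when tg != 0, to the four 0-based COO
+ * entries (1,0), (0,1), (2,1), (1,2): each stored off-diagonal entry is emitted
+ * together with its symmetric complement.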
+ */ +template +int mm_to_coo(FILE* f, + int tg, + IndexType_ nnz, + IndexType_* cooRowInd, + IndexType_* cooColInd, + ValueType_* cooRVal, + ValueType_* cooIVal) +{ + // Read matrix properties from file + MM_typecode t; + int m, n, nnzOld; + if (fseek(f, 0, SEEK_SET)) { + fprintf(stderr, "Error: could not set position in file\n"); + return -1; + } + if (mm_read_banner(f, &t)) { + fprintf(stderr, "Error: could not read Matrix Market file banner\n"); + return -1; + } + if (!mm_is_matrix(t) || !mm_is_coordinate(t)) { + fprintf(stderr, "Error: file does not contain matrix in coordinate format\n"); + return -1; + } + if (mm_read_mtx_crd_size(f, &m, &n, &nnzOld)) { + fprintf(stderr, "Error: could not read matrix dimensions\n"); + return -1; + } + if (!mm_is_pattern(t) && !mm_is_real(t) && !mm_is_integer(t) && !mm_is_complex(t)) { + fprintf(stderr, "Error: matrix entries are not valid type\n"); + return -1; + } + + // Add each matrix entry in file to COO format matrix + int i; // Entry index in Matrix Market file; can only be int in the MTX format + int j = 0; // Entry index in COO format matrix; can only be int in the MTX format + for (i = 0; i < nnzOld; ++i) { + // Read entry from file + int row, col; + double rval, ival; + int st; + if (mm_is_pattern(t)) { + st = fscanf(f, "%d %d\n", &row, &col); + rval = 1.0; + ival = 0.0; + } else if (mm_is_real(t) || mm_is_integer(t)) { + st = fscanf(f, "%d %d %lg\n", &row, &col, &rval); + ival = 0.0; + } else // Complex matrix + st = fscanf(f, "%d %d %lg %lg\n", &row, &col, &rval, &ival); + if (ferror(f) || (st == EOF)) { + fprintf(stderr, "Error: error %d reading Matrix Market file (entry %d)\n", st, i + 1); + return -1; + } + + // Switch to 0-based indexing + --row; + --col; + + // Record entry + cooRowInd[j] = row; + cooColInd[j] = col; + if (cooRVal != NULL) cooRVal[j] = rval; + if (cooIVal != NULL) cooIVal[j] = ival; + ++j; + + // Add symmetric complement of non-diagonal entries + if (tg && !mm_is_general(t) && (row != col)) { + // Modify entry value if matrix is skew symmetric or Hermitian + if (mm_is_skew(t)) { + rval = -rval; + ival = -ival; + } else if (mm_is_hermitian(t)) { + ival = -ival; + } + + // Record entry + cooRowInd[j] = col; + cooColInd[j] = row; + if (cooRVal != NULL) cooRVal[j] = rval; + if (cooIVal != NULL) cooIVal[j] = ival; + ++j; + } + } + return 0; +} + +// FIXME: A similar function could be useful for CSC format +// There are functions above that operate coo -> csr and coo->csc +/** + * @tparam + */ +template +std::unique_ptr> generate_graph_csr_from_mm( + bool& directed, std::string mm_file) +{ + vertex_t number_of_vertices; + edge_t number_of_edges; + + FILE* fpin = fopen(mm_file.c_str(), "r"); + CUGRAPH_EXPECTS(fpin != nullptr, "fopen (%s) failure.", mm_file.c_str()); + + vertex_t number_of_columns = 0; + MM_typecode mm_typecode{0}; + CUGRAPH_EXPECTS( + mm_properties( + fpin, 1, &mm_typecode, &number_of_vertices, &number_of_columns, &number_of_edges) == 0, + "mm_properties query failure."); + CUGRAPH_EXPECTS(mm_is_matrix(mm_typecode), "Invalid input file."); + CUGRAPH_EXPECTS(mm_is_coordinate(mm_typecode), "Invalid input file."); + CUGRAPH_EXPECTS(!mm_is_complex(mm_typecode), "Invalid input file."); + CUGRAPH_EXPECTS(!mm_is_skew(mm_typecode), "Invalid input file."); + + directed = !mm_is_symmetric(mm_typecode); + + // Allocate memory on host + std::vector coo_row_ind(number_of_edges); + std::vector coo_col_ind(number_of_edges); + std::vector coo_val(number_of_edges); + + // Read + CUGRAPH_EXPECTS( + (mm_to_coo( + fpin, 1, 
number_of_edges, &coo_row_ind[0], &coo_col_ind[0], &coo_val[0], NULL)) == 0, + "file read failure."); + CUGRAPH_EXPECTS(fclose(fpin) == 0, "fclose failure."); + + cugraph::legacy::GraphCOOView cooview( + &coo_row_ind[0], &coo_col_ind[0], &coo_val[0], number_of_vertices, number_of_edges); + + return cugraph::coo_to_csr(cooview); +} + +template +std::tuple, + rmm::device_uvector, + std::optional>, + vertex_t, + bool> +read_edgelist_from_matrix_market_file(raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted) +{ + MM_typecode mc{}; + vertex_t m{}; + size_t nnz{}; + + FILE* file = fopen(graph_file_full_path.c_str(), "r"); + CUGRAPH_EXPECTS(file != nullptr, "fopen (%s) failure.", graph_file_full_path.c_str()); + + size_t tmp_m{}; + size_t tmp_k{}; + auto mm_ret = cugraph::test::mm_properties(file, 1, &mc, &tmp_m, &tmp_k, &nnz); + CUGRAPH_EXPECTS(mm_ret == 0, "could not read Matrix Market file properties."); + m = static_cast(tmp_m); + CUGRAPH_EXPECTS(mm_is_matrix(mc) && mm_is_coordinate(mc) && !mm_is_complex(mc) && !mm_is_skew(mc), + "invalid Matrix Market file properties."); + + vertex_t number_of_vertices = m; + bool is_symmetric = mm_is_symmetric(mc); + + std::vector h_rows(nnz); + std::vector h_cols(nnz); + std::vector h_weights(nnz); + + mm_ret = cugraph::test::mm_to_coo( + file, 1, nnz, h_rows.data(), h_cols.data(), h_weights.data(), static_cast(nullptr)); + CUGRAPH_EXPECTS(mm_ret == 0, "could not read matrix data"); + + auto file_ret = fclose(file); + CUGRAPH_EXPECTS(file_ret == 0, "fclose failure."); + + rmm::device_uvector d_edgelist_rows(h_rows.size(), handle.get_stream()); + rmm::device_uvector d_edgelist_cols(h_cols.size(), handle.get_stream()); + auto d_edgelist_weights = test_weighted ? std::make_optional>( + h_weights.size(), handle.get_stream()) + : std::nullopt; + + raft::update_device(d_edgelist_rows.data(), h_rows.data(), h_rows.size(), handle.get_stream()); + raft::update_device(d_edgelist_cols.data(), h_cols.data(), h_cols.size(), handle.get_stream()); + if (d_edgelist_weights) { + raft::update_device( + (*d_edgelist_weights).data(), h_weights.data(), h_weights.size(), handle.get_stream()); + } + + return std::make_tuple(std::move(d_edgelist_rows), + std::move(d_edgelist_cols), + std::move(d_edgelist_weights), + number_of_vertices, + is_symmetric); +} + +template +std::tuple, + std::optional>> +read_graph_from_matrix_market_file(raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber) +{ + auto [d_edgelist_rows, d_edgelist_cols, d_edgelist_weights, number_of_vertices, is_symmetric] = + read_edgelist_from_matrix_market_file( + handle, graph_file_full_path, test_weighted); + + rmm::device_uvector d_vertices(number_of_vertices, handle.get_stream()); + thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_vertices.begin(), + d_vertices.end(), + vertex_t{0}); + handle.get_stream_view().synchronize(); + + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + auto vertex_key_func = + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}; + d_vertices.resize( + thrust::distance( + 
d_vertices.begin(), + thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_vertices.begin(), + d_vertices.end(), + [comm_rank, key_func = vertex_key_func] __device__(auto val) { + return key_func(val) != comm_rank; + })), + handle.get_stream()); + d_vertices.shrink_to_fit(handle.get_stream()); + + auto edge_key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}; + size_t number_of_local_edges{}; + if (d_edgelist_weights) { + auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( + d_edgelist_rows.begin(), d_edgelist_cols.begin(), (*d_edgelist_weights).begin())); + number_of_local_edges = thrust::distance( + edge_first, + thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + d_edgelist_rows.size(), + [comm_rank, key_func = edge_key_func] __device__(auto e) { + auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); + auto minor = store_transposed ? thrust::get<0>(e) : thrust::get<1>(e); + return key_func(major, minor) != comm_rank; + })); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(d_edgelist_rows.begin(), d_edgelist_cols.begin())); + number_of_local_edges = thrust::distance( + edge_first, + thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + d_edgelist_rows.size(), + [comm_rank, key_func = edge_key_func] __device__(auto e) { + auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); + auto minor = store_transposed ? thrust::get<0>(e) : thrust::get<1>(e); + return key_func(major, minor) != comm_rank; + })); + } + + d_edgelist_rows.resize(number_of_local_edges, handle.get_stream()); + d_edgelist_rows.shrink_to_fit(handle.get_stream()); + d_edgelist_cols.resize(number_of_local_edges, handle.get_stream()); + d_edgelist_cols.shrink_to_fit(handle.get_stream()); + if (d_edgelist_weights) { + (*d_edgelist_weights).resize(number_of_local_edges, handle.get_stream()); + (*d_edgelist_weights).shrink_to_fit(handle.get_stream()); + } + } + + handle.get_stream_view().synchronize(); + return cugraph::experimental:: + create_graph_from_edgelist( + handle, + std::optional>{ + std::make_tuple(d_vertices.data(), static_cast(d_vertices.size()))}, + std::move(d_edgelist_rows), + std::move(d_edgelist_cols), + std::move(d_edgelist_weights), + cugraph::experimental::graph_properties_t{is_symmetric, false}, + renumber); +} + +// explicit instantiations + +template int32_t mm_to_coo(FILE* f, + int32_t tg, + int32_t nnz, + int32_t* cooRowInd, + int32_t* cooColInd, + int32_t* cooRVal, + int32_t* cooIVal); + +template int32_t mm_to_coo(FILE* f, + int32_t tg, + int32_t nnz, + int32_t* cooRowInd, + int32_t* cooColInd, + double* cooRVal, + double* cooIVal); + +template int32_t mm_to_coo(FILE* f, + int32_t tg, + int32_t nnz, + int32_t* cooRowInd, + int32_t* cooColInd, + float* cooRVal, + float* cooIVal); + +template std::unique_ptr> +generate_graph_csr_from_mm(bool& directed, std::string mm_file); + +template std::unique_ptr> +generate_graph_csr_from_mm(bool& directed, std::string mm_file); + +template std::unique_ptr> +generate_graph_csr_from_mm(bool& directed, std::string mm_file); + +template std::unique_ptr> +generate_graph_csr_from_mm(bool& directed, std::string mm_file); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool 
test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + 
std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template std::tuple, + std::optional>> +read_graph_from_matrix_market_file( + raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/utilities/misc_utilities.cpp b/cpp/tests/utilities/misc_utilities.cpp new file mode 100644 index 00000000000..14f0df2f35d --- /dev/null +++ b/cpp/tests/utilities/misc_utilities.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +namespace cugraph { +namespace test { + +std::string getFileName(const std::string& s) +{ + char sep = '/'; +#ifdef _WIN32 + sep = '\\'; +#endif + size_t i = s.rfind(sep, s.length()); + if (i != std::string::npos) { return (s.substr(i + 1, s.length() - i)); } + return (""); +} + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/utilities/rmat_utilities.cpp b/cpp/tests/utilities/rmat_utilities.cpp new file mode 100644 index 00000000000..8502fe1b1c9 --- /dev/null +++ b/cpp/tests/utilities/rmat_utilities.cpp @@ -0,0 +1,600 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include + +namespace cugraph { +namespace test { + +template +std::tuple, + std::optional>> +generate_graph_from_rmat_params(raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t base_seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions) +{ + CUGRAPH_EXPECTS(!multi_gpu || renumber, "renumber should be true if multi_gpu is true."); + CUGRAPH_EXPECTS(size_t{1} << scale <= static_cast(std::numeric_limits::max()), + "vertex_t overflow."); + CUGRAPH_EXPECTS( + (size_t{1} << scale) * edge_factor <= static_cast(std::numeric_limits::max()), + " edge_t overflow."); + + vertex_t number_of_vertices = static_cast(size_t{1} << scale); + edge_t number_of_edges = + static_cast(static_cast(number_of_vertices) * edge_factor); + + std::vector partition_edge_counts(partition_ids.size()); + std::vector partition_vertex_firsts(partition_ids.size()); + std::vector partition_vertex_lasts(partition_ids.size()); + for (size_t i = 0; i < partition_ids.size(); ++i) { + auto id = partition_ids[i]; + + partition_edge_counts[i] = number_of_edges / num_partitions + + (id < number_of_edges % num_partitions ? edge_t{1} : edge_t{0}); + + partition_vertex_firsts[i] = (number_of_vertices / num_partitions) * id; + partition_vertex_lasts[i] = (number_of_vertices / num_partitions) * (id + 1); + if (id < number_of_vertices % num_partitions) { + partition_vertex_firsts[i] += id; + partition_vertex_lasts[i] += id + 1; + } else { + partition_vertex_firsts[i] += number_of_vertices % num_partitions; + partition_vertex_lasts[i] += number_of_vertices % num_partitions; + } + } + + rmm::device_uvector d_edgelist_rows(0, handle.get_stream()); + rmm::device_uvector d_edgelist_cols(0, handle.get_stream()); + auto d_edgelist_weights = + test_weighted ? std::make_optional>(0, handle.get_stream()) + : std::nullopt; + for (size_t i = 0; i < partition_ids.size(); ++i) { + auto id = partition_ids[i]; + + rmm::device_uvector d_tmp_rows(0, handle.get_stream()); + rmm::device_uvector d_tmp_cols(0, handle.get_stream()); + std::tie(i == 0 ? d_edgelist_rows : d_tmp_rows, i == 0 ? d_edgelist_cols : d_tmp_cols) = + cugraph::generate_rmat_edgelist(handle, + scale, + partition_edge_counts[i], + a, + b, + c, + base_seed + id, + undirected ? true : false); + + std::optional> d_tmp_weights{std::nullopt}; + if (d_edgelist_weights) { + if (i == 0) { + (*d_edgelist_weights).resize(d_edgelist_rows.size(), handle.get_stream()); + } else { + d_tmp_weights = + std::make_optional>(d_tmp_rows.size(), handle.get_stream()); + } + + cugraph::detail::uniform_random_fill( + handle.get_stream_view(), + i == 0 ? (*d_edgelist_weights).data() : (*d_tmp_weights).data(), + i == 0 ? 
(*d_edgelist_weights).size() : (*d_tmp_weights).size(), + weight_t{0.0}, + weight_t{1.0}, + base_seed + num_partitions + id); + } + + if (i > 0) { + auto start_offset = d_edgelist_rows.size(); + d_edgelist_rows.resize(start_offset + d_tmp_rows.size(), handle.get_stream()); + d_edgelist_cols.resize(d_edgelist_rows.size(), handle.get_stream()); + raft::copy(d_edgelist_rows.begin() + start_offset, + d_tmp_rows.begin(), + d_tmp_rows.size(), + handle.get_stream()); + raft::copy(d_edgelist_cols.begin() + start_offset, + d_tmp_cols.begin(), + d_tmp_cols.size(), + handle.get_stream()); + if (d_edgelist_weights) { + (*d_edgelist_weights).resize(d_edgelist_rows.size(), handle.get_stream()); + raft::copy(d_edgelist_weights->begin() + start_offset, + d_tmp_weights->begin(), + d_tmp_weights->size(), + handle.get_stream()); + } + } + } + + if (undirected) { +// FIXME: may need to undo this and handle symmetrization elsewhere once the new test graph +// generation API gets integrated +#if 1 + std::tie(d_edgelist_rows, d_edgelist_cols, d_edgelist_weights) = + cugraph::symmetrize_edgelist( + handle, + std::move(d_edgelist_rows), + std::move(d_edgelist_cols), + test_weighted ? std::optional>(std::move(d_edgelist_weights)) + : std::nullopt); +#endif + } + + if (multi_gpu) { + std::tie(d_edgelist_rows, d_edgelist_cols, d_edgelist_weights) = + cugraph::detail::shuffle_edgelist_by_edge( + handle, d_edgelist_rows, d_edgelist_cols, d_edgelist_weights, store_transposed); + } + + rmm::device_uvector d_vertices(0, handle.get_stream()); + for (size_t i = 0; i < partition_ids.size(); ++i) { + auto id = partition_ids[i]; + + auto start_offset = d_vertices.size(); + d_vertices.resize(start_offset + (partition_vertex_lasts[i] - partition_vertex_firsts[i]), + handle.get_stream()); + cugraph::detail::sequence_fill(handle.get_stream_view(), + d_vertices.begin() + start_offset, + d_vertices.size() - start_offset, + partition_vertex_firsts[i]); + } + + if (multi_gpu) { d_vertices = cugraph::detail::shuffle_vertices(handle, d_vertices); } + + return cugraph::experimental:: + create_graph_from_edgelist( + handle, + std::optional>{ + std::make_tuple(d_vertices.data(), static_cast(d_vertices.size()))}, + std::move(d_edgelist_rows), + std::move(d_edgelist_cols), + std::move(d_edgelist_weights), + cugraph::experimental::graph_properties_t{undirected, true}, + renumber); +} // namespace test + +// explicit instantiations + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t 
const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + 
double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +template std::tuple, + std::optional>> +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool 
test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp new file mode 100644 index 00000000000..150f2790277 --- /dev/null +++ b/cpp/tests/utilities/test_graphs.hpp @@ -0,0 +1,534 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include + +namespace cugraph { +namespace test { + +namespace detail { + +class TranslateGraph_Usecase { + public: + TranslateGraph_Usecase() = delete; + TranslateGraph_Usecase(size_t base_vertex_id = 0) : base_vertex_id_(base_vertex_id) {} + + template + void translate(raft::handle_t const& handle, + rmm::device_uvector& d_src, + rmm::device_uvector& d_dst) const + { + if (base_vertex_id_ > 0) + cugraph::test::translate_vertex_ids( + handle, d_src, d_dst, static_cast(base_vertex_id_)); + } + + size_t base_vertex_id_{}; +}; + +} // namespace detail + +class File_Usecase : public detail::TranslateGraph_Usecase { + public: + File_Usecase() = delete; + + File_Usecase(std::string const& graph_file_path, size_t base_vertex_id = 0) + : detail::TranslateGraph_Usecase(base_vertex_id) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path_ = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path_ = graph_file_path; + } + } + + template + std::tuple, + rmm::device_uvector, + std::optional>, + vertex_t, + bool> + construct_edgelist(raft::handle_t const& handle, bool test_weighted) const + { + auto [d_src_v, d_dst_v, d_weights_v, num_vertices, is_symmetric] = + read_edgelist_from_matrix_market_file( + handle, graph_file_full_path_, test_weighted); + + translate(handle, d_src_v, d_dst_v); + +#if 0 + if (multi_gpu) { + std::tie(d_src_v, d_dst_v) = filter_edgelist_by_gpu(handle, d_src_v, d_dst_v); + } +#endif + + return std::make_tuple( + std::move(d_src_v), + std::move(d_dst_v), + std::move(d_weights_v), + static_cast(detail::TranslateGraph_Usecase::base_vertex_id_) + num_vertices, + is_symmetric); + } + + template + std::tuple< + cugraph::experimental::graph_t, + std::optional>> + construct_graph(raft::handle_t const& handle, bool test_weighted, bool renumber = true) const + { + auto [d_src_v, d_dst_v, d_weights_v, num_vertices, is_symmetric] = + this->template construct_edgelist( + handle, test_weighted); + + // TODO: Consider calling construct_edgelist and creating + // a generic test function to take the edgelist and + // do the graph construction. 
+ // + // Would be more reusable across tests + // + return read_graph_from_matrix_market_file( + handle, graph_file_full_path_, test_weighted, renumber); + } + + private: + std::string graph_file_full_path_{}; +}; + +class Rmat_Usecase : public detail::TranslateGraph_Usecase { + public: + Rmat_Usecase() = delete; + + Rmat_Usecase(size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + size_t base_vertex_id = 0, + bool multi_gpu_usecase = false) + : detail::TranslateGraph_Usecase(base_vertex_id), + scale_(scale), + edge_factor_(edge_factor), + a_(a), + b_(b), + c_(c), + seed_(seed), + undirected_(undirected), + scramble_vertex_ids_(scramble_vertex_ids), + multi_gpu_usecase_(multi_gpu_usecase) + { + } + + template + std::tuple, + rmm::device_uvector, + std::optional>, + vertex_t, + bool> + construct_edgelist(raft::handle_t const& handle, bool test_weighted) const + { + // TODO: Tease through generate_graph_from_rmat_params + // to extract the edgelist part + // Call cugraph::translate_vertex_ids(handle, d_src_v, d_dst_v, base_vertex_id_); + + CUGRAPH_FAIL("Not implemented"); + } + + template + std::tuple< + cugraph::experimental::graph_t, + std::optional>> + construct_graph(raft::handle_t const& handle, bool test_weighted, bool renumber = true) const + { + std::vector partition_ids(1); + size_t comm_size; + + if (multi_gpu_usecase_) { + auto& comm = handle.get_comms(); + comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + partition_ids.resize(multi_gpu ? size_t{1} : static_cast(comm_size)); + + std::iota(partition_ids.begin(), + partition_ids.end(), + multi_gpu ? static_cast(comm_rank) : size_t{0}); + } else { + comm_size = 1; + partition_ids[0] = size_t{0}; + } + + // TODO: Need to offset by base_vertex_id_ + // static_cast(base_vertex_id_)); + // Consider using construct_edgelist like other options + return generate_graph_from_rmat_params( + handle, + scale_, + edge_factor_, + a_, + b_, + c_, + seed_, + undirected_, + scramble_vertex_ids_, + test_weighted, + renumber, + partition_ids, + comm_size); + } + + private: + size_t scale_{}; + size_t edge_factor_{}; + double a_{}; + double b_{}; + double c_{}; + uint64_t seed_{}; + bool undirected_{}; + bool scramble_vertex_ids_{}; + bool multi_gpu_usecase_{}; +}; + +class PathGraph_Usecase { + public: + PathGraph_Usecase() = delete; + + PathGraph_Usecase(std::vector> parms, + bool weighted = false, + bool scramble = false) + : parms_(parms), weighted_(weighted) + { + } + + template + std::tuple, + rmm::device_uvector, + std::optional>, + vertex_t, + bool> + construct_edgelist(raft::handle_t const& handle, bool test_weighted) const + { + constexpr bool symmetric{true}; + + std::vector> converted_parms(parms_.size()); + + std::transform(parms_.begin(), parms_.end(), converted_parms.begin(), [](auto p) { + return std::make_tuple(static_cast(std::get<0>(p)), + static_cast(std::get<1>(p))); + }); + + auto [src_v, dst_v] = cugraph::generate_path_graph_edgelist(handle, converted_parms); + std::tie(src_v, dst_v, std::ignore) = cugraph::symmetrize_edgelist( + handle, std::move(src_v), std::move(dst_v), std::nullopt); + + return std::make_tuple(std::move(src_v), + std::move(dst_v), + test_weighted ? 
std::make_optional>( + src_v.size(), handle.get_stream()) + : std::nullopt, + num_vertices_, + symmetric); + } + + template + std::tuple< + cugraph::experimental::graph_t, + std::optional>> + construct_graph(raft::handle_t const& handle, bool test_weighted, bool renumber = true) const + { + CUGRAPH_FAIL("not implemented"); + } + + private: + std::vector> parms_{}; + size_t num_vertices_{0}; + bool weighted_{false}; +}; + +class Mesh2DGraph_Usecase { + public: + Mesh2DGraph_Usecase() = delete; + + Mesh2DGraph_Usecase(std::vector> const& parms, bool weighted) + : parms_(parms), weighted_(weighted) + { + } + + template + std::tuple, + rmm::device_uvector, + std::optional>, + vertex_t, + bool> + construct_edgelist(raft::handle_t const& handle, bool test_weighted) const + { + } + + template + std::tuple< + cugraph::experimental::graph_t, + std::optional>> + construct_graph(raft::handle_t const& handle, bool test_weighted, bool renumber = true) const; + + private: + std::vector> parms_{}; + bool weighted_{false}; +}; + +class Mesh3DGraph_Usecase { + public: + Mesh3DGraph_Usecase() = delete; + + Mesh3DGraph_Usecase(std::vector> const& parms, + bool weighted) + : parms_(parms), weighted_(weighted) + { + } + + template + std::tuple, + rmm::device_uvector, + std::optional>, + vertex_t, + bool> + construct_edgelist(raft::handle_t const& handle, bool test_weighted) const; + + template + std::tuple< + cugraph::experimental::graph_t, + std::optional>> + construct_graph(raft::handle_t const& handle, bool test_weighted, bool renumber = true) const; + + private: + std::vector> parms_{}; + bool weighted_{false}; +}; + +class CompleteGraph_Usecase { + public: + CompleteGraph_Usecase() = delete; + + CompleteGraph_Usecase(std::vector> const& parms, bool weighted) + : parms_(parms), weighted_(weighted) + { + } + + template + std::tuple, + rmm::device_uvector, + std::optional>, + vertex_t, + bool> + construct_edgelist(raft::handle_t const& handle, bool test_weighted) const; + + template + std::tuple< + cugraph::experimental::graph_t, + std::optional>> + construct_graph(raft::handle_t const& handle, bool test_weighted, bool renumber = true) const; + + private: + std::vector> parms_{}; + bool weighted_{false}; +}; + +namespace detail { + +template +struct combined_construct_graph_tuple_impl { + template + std::vector, + rmm::device_uvector, + rmm::device_uvector, + vertex_t, + bool>> + construct_edges(raft::handle_t const& handle, + bool test_weighted, + generator_tuple_t const& generator_tuple) const + { + return combined_construct_graph_tuple_impl() + .construct_edges(generator_tuple) + .push_back(std::get(generator_tuple).construct_edges(handle, test_weighted)); + } +}; + +template +struct combined_construct_graph_tuple_impl { + template + std::vector, + rmm::device_uvector, + rmm::device_uvector, + vertex_t, + bool>> + construct_edges(raft::handle_t const& handle, + bool test_weighted, + generator_tuple_t const& generator_tuple) const + { + return std::vector, + rmm::device_uvector, + rmm::device_uvector, + vertex_t, + bool>>(); + } +}; + +} // namespace detail + +template +class CombinedGenerator_Usecase { + CombinedGenerator_Usecase() = delete; + + CombinedGenerator_Usecase(generator_tuple_t const& tuple) : generator_tuple_(tuple) {} + + template + std::tuple, + rmm::device_uvector, + std::optional>, + vertex_t, + bool> + construct_edgelist(raft::handle_t const& handle, bool test_weighted) const + { + size_t constexpr tuple_size{std::tuple_size::value}; + + auto edge_tuple_vector = + 
detail::combined_construct_graph_tuple_impl() + .construct_edges(handle, test_weighted, generator_tuple_); + + // Need to combine + CUGRAPH_FAIL("not implemented"); + } + + template + std::tuple< + cugraph::experimental::graph_t, + std::optional>> + construct_graph(raft::handle_t const& handle, bool test_weighted, bool renumber = true) const + { + // Call construct_edgelist to get tuple of edge lists + // return generate_graph_from_edgelist<...>(...) + CUGRAPH_FAIL("not implemented"); + } + + private: + generator_tuple_t const& generator_tuple_; +}; + +template +std::tuple, + std::optional>> +construct_graph(raft::handle_t const& handle, + input_usecase_t const& input_usecase, + bool test_weighted, + bool renumber = true) +{ + auto [d_src_v, d_dst_v, d_weights_v, num_vertices, is_symmetric] = + input_usecase + .template construct_edgelist( + handle, test_weighted); + + return cugraph::experimental:: + create_graph_from_edgelist( + handle, + std::nullopt, + std::move(d_src_v), + std::move(d_dst_v), + std::move(d_weights_v), + cugraph::experimental::graph_properties_t{is_symmetric, false}, + renumber); +} + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/utilities/test_utilities.hpp b/cpp/tests/utilities/test_utilities.hpp index 518e7c2860e..b452ff9a95f 100644 --- a/cpp/tests/utilities/test_utilities.hpp +++ b/cpp/tests/utilities/test_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,38 +15,29 @@ */ #pragma once -#include -#include -#include +#include +#include +#include -#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include extern "C" { #include "mmio.h" } -#include - -#include -#include -#include -#include - namespace cugraph { namespace test { -std::string getFileName(const std::string& s) -{ - char sep = '/'; - -#ifdef _WIN32 - sep = '\\'; -#endif - - size_t i = s.rfind(sep, s.length()); - if (i != std::string::npos) { return (s.substr(i + 1, s.length() - i)); } - return (""); -} +std::string getFileName(const std::string& s); /// Read matrix properties from Matrix Market file /** Matrix Market file is assumed to be a sparse matrix in coordinate @@ -63,64 +54,7 @@ std::string getFileName(const std::string& s) * non-zero. 
*/ template -int mm_properties(FILE* f, int tg, MM_typecode* t, IndexType_* m, IndexType_* n, IndexType_* nnz) -{ - // Read matrix properties from file - int mint, nint, nnzint; - if (fseek(f, 0, SEEK_SET)) { - fprintf(stderr, "Error: could not set position in file\n"); - return -1; - } - if (mm_read_banner(f, t)) { - fprintf(stderr, "Error: could not read Matrix Market file banner\n"); - return -1; - } - if (!mm_is_matrix(*t) || !mm_is_coordinate(*t)) { - fprintf(stderr, "Error: file does not contain matrix in coordinate format\n"); - return -1; - } - if (mm_read_mtx_crd_size(f, &mint, &nint, &nnzint)) { - fprintf(stderr, "Error: could not read matrix dimensions\n"); - return -1; - } - if (!mm_is_pattern(*t) && !mm_is_real(*t) && !mm_is_integer(*t) && !mm_is_complex(*t)) { - fprintf(stderr, "Error: matrix entries are not valid type\n"); - return -1; - } - *m = mint; - *n = nint; - *nnz = nnzint; - - // Find total number of non-zero entries - if (tg && !mm_is_general(*t)) { - // Non-diagonal entries should be counted twice - *nnz *= 2; - - // Diagonal entries should not be double-counted - int st; - for (int i = 0; i < nnzint; ++i) { - // Read matrix entry - // MTX only supports int for row and col idx - int row, col; - double rval, ival; - if (mm_is_pattern(*t)) - st = fscanf(f, "%d %d\n", &row, &col); - else if (mm_is_real(*t) || mm_is_integer(*t)) - st = fscanf(f, "%d %d %lg\n", &row, &col, &rval); - else // Complex matrix - st = fscanf(f, "%d %d %lg %lg\n", &row, &col, &rval, &ival); - if (ferror(f) || (st == EOF)) { - fprintf(stderr, "Error: error %d reading Matrix Market file (entry %d)\n", st, i + 1); - return -1; - } - - // Check if entry is diagonal - if (row == col) --(*nnz); - } - } - - return 0; -} +int mm_properties(FILE* f, int tg, MM_typecode* t, IndexType_* m, IndexType_* n, IndexType_* nnz); /// Read Matrix Market file and convert to COO format matrix /** Matrix Market file is assumed to be a sparse matrix in coordinate @@ -149,169 +83,16 @@ int mm_to_coo(FILE* f, IndexType_* cooRowInd, IndexType_* cooColInd, ValueType_* cooRVal, - ValueType_* cooIVal) -{ - // Read matrix properties from file - MM_typecode t; - int m, n, nnzOld; - if (fseek(f, 0, SEEK_SET)) { - fprintf(stderr, "Error: could not set position in file\n"); - return -1; - } - if (mm_read_banner(f, &t)) { - fprintf(stderr, "Error: could not read Matrix Market file banner\n"); - return -1; - } - if (!mm_is_matrix(t) || !mm_is_coordinate(t)) { - fprintf(stderr, "Error: file does not contain matrix in coordinate format\n"); - return -1; - } - if (mm_read_mtx_crd_size(f, &m, &n, &nnzOld)) { - fprintf(stderr, "Error: could not read matrix dimensions\n"); - return -1; - } - if (!mm_is_pattern(t) && !mm_is_real(t) && !mm_is_integer(t) && !mm_is_complex(t)) { - fprintf(stderr, "Error: matrix entries are not valid type\n"); - return -1; - } - - // Add each matrix entry in file to COO format matrix - int i; // Entry index in Matrix Market file; can only be int in the MTX format - int j = 0; // Entry index in COO format matrix; can only be int in the MTX format - for (i = 0; i < nnzOld; ++i) { - // Read entry from file - int row, col; - double rval, ival; - int st; - if (mm_is_pattern(t)) { - st = fscanf(f, "%d %d\n", &row, &col); - rval = 1.0; - ival = 0.0; - } else if (mm_is_real(t) || mm_is_integer(t)) { - st = fscanf(f, "%d %d %lg\n", &row, &col, &rval); - ival = 0.0; - } else // Complex matrix - st = fscanf(f, "%d %d %lg %lg\n", &row, &col, &rval, &ival); - if (ferror(f) || (st == EOF)) { - fprintf(stderr, "Error: 
error %d reading Matrix Market file (entry %d)\n", st, i + 1); - return -1; - } - - // Switch to 0-based indexing - --row; - --col; - - // Record entry - cooRowInd[j] = row; - cooColInd[j] = col; - if (cooRVal != NULL) cooRVal[j] = rval; - if (cooIVal != NULL) cooIVal[j] = ival; - ++j; - - // Add symmetric complement of non-diagonal entries - if (tg && !mm_is_general(t) && (row != col)) { - // Modify entry value if matrix is skew symmetric or Hermitian - if (mm_is_skew(t)) { - rval = -rval; - ival = -ival; - } else if (mm_is_hermitian(t)) { - ival = -ival; - } - - // Record entry - cooRowInd[j] = col; - cooColInd[j] = row; - if (cooRVal != NULL) cooRVal[j] = rval; - if (cooIVal != NULL) cooIVal[j] = ival; - ++j; - } - } - return 0; -} - -int read_binary_vector(FILE* fpin, int n, std::vector& val) -{ - size_t is_read1; - - double* t_storage = new double[n]; - is_read1 = fread(t_storage, sizeof(double), n, fpin); - for (int i = 0; i < n; i++) { - if (t_storage[i] == DBL_MAX) - val[i] = FLT_MAX; - else if (t_storage[i] == -DBL_MAX) - val[i] = -FLT_MAX; - else - val[i] = static_cast(t_storage[i]); - } - delete[] t_storage; - - if (is_read1 != (size_t)n) { - printf("%s", "I/O fail\n"); - return 1; - } - return 0; -} - -int read_binary_vector(FILE* fpin, int n, std::vector& val) -{ - size_t is_read1; - - is_read1 = fread(&val[0], sizeof(double), n, fpin); - - if (is_read1 != (size_t)n) { - printf("%s", "I/O fail\n"); - return 1; - } - return 0; -} + ValueType_* cooIVal); // FIXME: A similar function could be useful for CSC format // There are functions above that operate coo -> csr and coo->csc /** * @tparam */ -template -std::unique_ptr> generate_graph_csr_from_mm(bool& directed, - std::string mm_file) -{ - VT number_of_vertices; - ET number_of_edges; - - FILE* fpin = fopen(mm_file.c_str(), "r"); - EXPECT_NE(fpin, nullptr); - - VT number_of_columns = 0; - MM_typecode mm_typecode{0}; - EXPECT_EQ(mm_properties( - fpin, 1, &mm_typecode, &number_of_vertices, &number_of_columns, &number_of_edges), - 0); - EXPECT_TRUE(mm_is_matrix(mm_typecode)); - EXPECT_TRUE(mm_is_coordinate(mm_typecode)); - EXPECT_FALSE(mm_is_complex(mm_typecode)); - EXPECT_FALSE(mm_is_skew(mm_typecode)); - - directed = !mm_is_symmetric(mm_typecode); - - // Allocate memory on host - std::vector coo_row_ind(number_of_edges); - std::vector coo_col_ind(number_of_edges); - std::vector coo_val(number_of_edges); - - // Read - EXPECT_EQ((mm_to_coo( - fpin, 1, number_of_edges, &coo_row_ind[0], &coo_col_ind[0], &coo_val[0], NULL)), - 0); - EXPECT_EQ(fclose(fpin), 0); - - cugraph::GraphCOOView cooview( - &coo_row_ind[0], &coo_col_ind[0], &coo_val[0], number_of_vertices, number_of_edges); - - return cugraph::coo_to_csr(cooview); -} - -//////////////////////////////////////////////////////////////////////////////// -// FIXME: move this code to rapids-core -//////////////////////////////////////////////////////////////////////////////// +template +std::unique_ptr> generate_graph_csr_from_mm( + bool& directed, std::string mm_file); // Define RAPIDS_DATASET_ROOT_DIR using a preprocessor variable to // allow for a build to override the default. 
This is useful for @@ -331,89 +112,260 @@ static const std::string& get_rapids_dataset_root_dir() return rdrd; } +// returns a tuple of (rows, columns, weights, number_of_vertices, is_symmetric) template -struct edgelist_from_market_matrix_file_t { - std::vector h_rows{}; - std::vector h_cols{}; - std::vector h_weights{}; - vertex_t number_of_vertices{}; - bool is_symmetric{}; -}; +std::tuple, + rmm::device_uvector, + std::optional>, + vertex_t, + bool> +read_edgelist_from_matrix_market_file(raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted); + +// renumber must be true if multi_gpu is true +template +std::tuple, + std::optional>> +read_graph_from_matrix_market_file(raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted, + bool renumber); + +template +std::tuple, + std::optional>> +generate_graph_from_rmat_params(raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); + +// alias for easy customization for debug purposes: +// +template +using vector_test_t = rmm::device_uvector; template -edgelist_from_market_matrix_file_t read_edgelist_from_matrix_market_file( - std::string const& graph_file_full_path) +decltype(auto) make_graph(raft::handle_t const& handle, + std::vector const& v_src, + std::vector const& v_dst, + std::optional> const& v_w, + vertex_t num_vertices, + edge_t num_edges) +{ + using namespace cugraph::experimental; + + vector_test_t d_src(num_edges, handle.get_stream()); + vector_test_t d_dst(num_edges, handle.get_stream()); + auto d_w = v_w ? 
std::make_optional>(num_edges, handle.get_stream()) + : std::nullopt; + + raft::update_device(d_src.data(), v_src.data(), d_src.size(), handle.get_stream()); + raft::update_device(d_dst.data(), v_dst.data(), d_dst.size(), handle.get_stream()); + if (d_w) { + raft::update_device((*d_w).data(), (*v_w).data(), (*d_w).size(), handle.get_stream()); + } + + cugraph::experimental::graph_t graph(handle); + std::tie(graph, std::ignore) = + cugraph::experimental::create_graph_from_edgelist( + handle, + std::nullopt, + std::move(d_src), + std::move(d_dst), + std::move(d_w), + cugraph::experimental::graph_properties_t{false, false}, + false); + + return graph; +} + +// compares single GPU CSR graph data: +// (for testing / debugging); +// on first == false, second == brief description of what is different; +// +template +std::pair compare_graphs(raft::handle_t const& handle, + left_graph_t const& lgraph, + right_graph_t const& rgraph) +{ + if constexpr (left_graph_t::is_multi_gpu && right_graph_t::is_multi_gpu) { + // no support for comparing distributed graphs, yet: + // + CUGRAPH_FAIL("Unsupported graph type for comparison."); + return std::make_pair(false, std::string("unsupported")); + } else if constexpr (!std::is_same_v) { + return std::make_pair(false, std::string("type")); + } else { + // both graphs are single GPU: + // + using graph_t = left_graph_t; + + using vertex_t = typename graph_t::vertex_type; + using edge_t = typename graph_t::edge_type; + using weight_t = typename graph_t::weight_type; + + size_t num_vertices = lgraph.get_number_of_vertices(); + size_t num_edges = lgraph.get_number_of_edges(); + + { + size_t r_num_vertices = rgraph.get_number_of_vertices(); + size_t r_num_edges = rgraph.get_number_of_edges(); + + if (num_vertices != r_num_vertices) return std::make_pair(false, std::string("num_vertices")); + + if (num_edges != r_num_edges) return std::make_pair(false, std::string("num_edges")); + } + + if (lgraph.is_symmetric() != rgraph.is_symmetric()) + return std::make_pair(false, std::string("symmetric")); + + if (lgraph.is_multigraph() != rgraph.is_multigraph()) + return std::make_pair(false, std::string("multigraph")); + + bool is_weighted = lgraph.is_weighted(); + if (is_weighted != rgraph.is_weighted()) return std::make_pair(false, std::string("weighted")); + + auto lgraph_view = lgraph.view(); + auto rgraph_view = rgraph.view(); + + std::vector lv_ro(num_vertices + 1); + std::vector lv_ci(num_edges); + + raft::update_host(lv_ro.data(), + lgraph_view.get_matrix_partition_view().get_offsets(), + num_vertices + 1, + handle.get_stream()); + raft::update_host(lv_ci.data(), + lgraph_view.get_matrix_partition_view().get_indices(), + num_edges, + handle.get_stream()); + + std::vector rv_ro(num_vertices + 1); + std::vector rv_ci(num_edges); + + raft::update_host(rv_ro.data(), + rgraph_view.get_matrix_partition_view().get_offsets(), + num_vertices + 1, + handle.get_stream()); + raft::update_host(rv_ci.data(), + rgraph_view.get_matrix_partition_view().get_indices(), + num_edges, + handle.get_stream()); + + auto lv_vs = is_weighted ? std::make_optional>(num_edges) : std::nullopt; + auto rv_vs = is_weighted ? 
std::make_optional>(num_edges) : std::nullopt; + if (is_weighted) { + raft::update_host((*lv_vs).data(), + *(lgraph_view.get_matrix_partition_view().get_weights()), + num_edges, + handle.get_stream()); + + raft::update_host((*rv_vs).data(), + *(rgraph_view.get_matrix_partition_view().get_weights()), + num_edges, + handle.get_stream()); + } + + handle.get_stream_view().synchronize(); + + if (lv_ro != rv_ro) return std::make_pair(false, std::string("offsets")); + + for (size_t i = 0; i < num_vertices; ++i) { + auto first = lv_ro[i]; + auto last = lv_ro[i + 1]; + if (is_weighted) { + std::vector> lv_pairs(last - first); + std::vector> rv_pairs(last - first); + for (edge_t j = first; j < last; ++j) { + lv_pairs[j - first] = std::make_tuple(lv_ci[j], (*lv_vs)[j]); + rv_pairs[j - first] = std::make_tuple(rv_ci[j], (*rv_vs)[j]); + } + std::sort(lv_pairs.begin(), lv_pairs.end()); + std::sort(rv_pairs.begin(), rv_pairs.end()); + if (!std::equal(lv_pairs.begin(), lv_pairs.end(), rv_pairs.begin(), [](auto lhs, auto rhs) { + return std::get<0>(lhs) == std::get<0>(rhs); + })) + return std::make_pair(false, std::string("indices")); + if (!std::equal(lv_pairs.begin(), lv_pairs.end(), rv_pairs.begin(), [](auto lhs, auto rhs) { + return std::get<1>(lhs) == std::get<1>(rhs); + })) + return std::make_pair(false, std::string("values")); + } else { + std::sort(lv_ci.begin() + first, lv_ci.begin() + last); + std::sort(rv_ci.begin() + first, rv_ci.begin() + last); + if (!std::equal(lv_ci.begin() + first, lv_ci.begin() + last, rv_ci.begin() + first)) + return std::make_pair(false, std::string("indices")); + } + } + + if (lgraph_view.get_local_adj_matrix_partition_segment_offsets(0) != + rgraph_view.get_local_adj_matrix_partition_segment_offsets(0)) + return std::make_pair(false, std::string("segment offsets")); + + return std::make_pair(true, std::string{}); + } +} + +template +bool renumbered_vectors_same(raft::handle_t const& handle, + std::vector const& v1, + std::vector const& v2) { - edgelist_from_market_matrix_file_t ret{}; + if (v1.size() != v2.size()) return false; - MM_typecode mc{}; - vertex_t m{}; - edge_t nnz{}; + std::map map; - FILE* file = fopen(graph_file_full_path.c_str(), "r"); - CUGRAPH_EXPECTS(file != nullptr, "fopen failure."); + auto iter = thrust::make_zip_iterator(thrust::make_tuple(v1.begin(), v2.begin())); - edge_t tmp_m{}; - edge_t tmp_k{}; - auto mm_ret = cugraph::test::mm_properties(file, 1, &mc, &tmp_m, &tmp_k, &nnz); - CUGRAPH_EXPECTS(mm_ret == 0, "could not read Matrix Market file properties."); - m = static_cast(tmp_m); - CUGRAPH_EXPECTS(mm_is_matrix(mc) && mm_is_coordinate(mc) && !mm_is_complex(mc) && !mm_is_skew(mc), - "invalid Matrix Market file properties."); + std::for_each(iter, iter + v1.size(), [&map](auto pair) { + vertex_t e1 = thrust::get<0>(pair); + vertex_t e2 = thrust::get<1>(pair); - ret.h_rows.assign(nnz, vertex_t{0}); - ret.h_cols.assign(nnz, vertex_t{0}); - ret.h_weights.assign(nnz, weight_t{0.0}); - ret.number_of_vertices = m; - ret.is_symmetric = mm_is_symmetric(mc); + map[e1] = e2; + }); - mm_ret = cugraph::test::mm_to_coo( - file, 1, nnz, ret.h_rows.data(), ret.h_cols.data(), ret.h_weights.data(), nullptr); - CUGRAPH_EXPECTS(mm_ret == 0, "could not read matrix data"); + auto error_count = std::count_if(iter, iter + v1.size(), [&map](auto pair) { + vertex_t e1 = thrust::get<0>(pair); + vertex_t e2 = thrust::get<1>(pair); - auto file_ret = fclose(file); - CUGRAPH_EXPECTS(file_ret == 0, "fclose failure."); + return (map[e1] != e2); + }); - return std::move(ret); 
+ return (error_count == 0); } -template -cugraph::experimental::graph_t -read_graph_from_matrix_market_file(raft::handle_t const& handle, - std::string const& graph_file_full_path, - bool test_weighted) +template +bool renumbered_vectors_same(raft::handle_t const& handle, + rmm::device_uvector const& v1, + rmm::device_uvector const& v2) { - auto mm_graph = - read_edgelist_from_matrix_market_file(graph_file_full_path); - edge_t number_of_edges = static_cast(mm_graph.h_rows.size()); - - rmm::device_uvector d_edgelist_rows(number_of_edges, handle.get_stream()); - rmm::device_uvector d_edgelist_cols(number_of_edges, handle.get_stream()); - rmm::device_uvector d_edgelist_weights(test_weighted ? number_of_edges : 0, - handle.get_stream()); - - raft::update_device( - d_edgelist_rows.data(), mm_graph.h_rows.data(), number_of_edges, handle.get_stream()); - raft::update_device( - d_edgelist_cols.data(), mm_graph.h_cols.data(), number_of_edges, handle.get_stream()); - if (test_weighted) { - raft::update_device( - d_edgelist_weights.data(), mm_graph.h_weights.data(), number_of_edges, handle.get_stream()); - } + if (v1.size() != v2.size()) return false; + + std::vector h_v1(v1.size()); + std::vector h_v2(v1.size()); + + raft::update_host(h_v1.data(), v1.data(), v1.size(), handle.get_stream()); + raft::update_host(h_v2.data(), v2.data(), v2.size(), handle.get_stream()); - cugraph::experimental::edgelist_t edgelist{ - d_edgelist_rows.data(), - d_edgelist_cols.data(), - test_weighted ? d_edgelist_weights.data() : nullptr, - number_of_edges}; - - return cugraph::experimental::graph_t( - handle, - edgelist, - mm_graph.number_of_vertices, - cugraph::experimental::graph_properties_t{mm_graph.is_symmetric, false}, - false, - true); + return renumbered_vectors_same(handle, h_v1, h_v2); } } // namespace test diff --git a/cpp/tests/utilities/thrust_wrapper.cu b/cpp/tests/utilities/thrust_wrapper.cu new file mode 100644 index 00000000000..ae36582d18d --- /dev/null +++ b/cpp/tests/utilities/thrust_wrapper.cu @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +#include +#include + +namespace cugraph { +namespace test { + +template +std::tuple, rmm::device_uvector> sort_by_key( + raft::handle_t const& handle, vertex_t const* keys, value_t const* values, size_t num_pairs) +{ + rmm::device_uvector sorted_keys(num_pairs, handle.get_stream_view()); + rmm::device_uvector sorted_values(num_pairs, handle.get_stream_view()); + + thrust::copy( + rmm::exec_policy(handle.get_stream_view()), keys, keys + num_pairs, sorted_keys.begin()); + thrust::copy( + rmm::exec_policy(handle.get_stream_view()), values, values + num_pairs, sorted_values.begin()); + + thrust::sort_by_key(rmm::exec_policy(handle.get_stream_view()), + sorted_keys.begin(), + sorted_keys.end(), + sorted_values.begin()); + + return std::make_tuple(std::move(sorted_keys), std::move(sorted_values)); +} + +template std::tuple, rmm::device_uvector> +sort_by_key(raft::handle_t const& handle, + int32_t const* keys, + float const* values, + size_t num_pairs); + +template std::tuple, rmm::device_uvector> +sort_by_key(raft::handle_t const& handle, + int32_t const* keys, + double const* values, + size_t num_pairs); + +template std::tuple, rmm::device_uvector> +sort_by_key(raft::handle_t const& handle, + int32_t const* keys, + int32_t const* values, + size_t num_pairs); + +template std::tuple, rmm::device_uvector> +sort_by_key(raft::handle_t const& handle, + int64_t const* keys, + float const* values, + size_t num_pairs); + +template std::tuple, rmm::device_uvector> +sort_by_key(raft::handle_t const& handle, + int64_t const* keys, + double const* values, + size_t num_pairs); + +template std::tuple, rmm::device_uvector> +sort_by_key(raft::handle_t const& handle, + int64_t const* keys, + int64_t const* values, + size_t num_pairs); + +template +void translate_vertex_ids(raft::handle_t const& handle, + rmm::device_uvector& d_src_v, + rmm::device_uvector& d_dst_v, + vertex_t vertex_id_offset) +{ + thrust::transform(rmm::exec_policy(handle.get_stream()), + d_src_v.begin(), + d_src_v.end(), + d_src_v.begin(), + [offset = vertex_id_offset] __device__(vertex_t v) { return offset + v; }); + + thrust::transform(rmm::exec_policy(handle.get_stream()), + d_dst_v.begin(), + d_dst_v.end(), + d_dst_v.begin(), + [offset = vertex_id_offset] __device__(vertex_t v) { return offset + v; }); +} + +template +void populate_vertex_ids(raft::handle_t const& handle, + rmm::device_uvector& d_vertices_v, + vertex_t vertex_id_offset) +{ + thrust::sequence(rmm::exec_policy(handle.get_stream()), + d_vertices_v.begin(), + d_vertices_v.end(), + vertex_id_offset); +} + +template void translate_vertex_ids(raft::handle_t const& handle, + rmm::device_uvector& d_src_v, + rmm::device_uvector& d_dst_v, + int32_t vertex_id_offset); + +template void translate_vertex_ids(raft::handle_t const& handle, + rmm::device_uvector& d_src_v, + rmm::device_uvector& d_dst_v, + int64_t vertex_id_offset); + +template void populate_vertex_ids(raft::handle_t const& handle, + rmm::device_uvector& d_vertices_v, + int32_t vertex_id_offset); + +template void populate_vertex_ids(raft::handle_t const& handle, + rmm::device_uvector& d_vertices_v, + int64_t vertex_id_offset); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/utilities/thrust_wrapper.hpp b/cpp/tests/utilities/thrust_wrapper.hpp new file mode 100644 index 00000000000..45208a6b921 --- /dev/null +++ b/cpp/tests/utilities/thrust_wrapper.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +namespace cugraph { +namespace test { + +template +std::tuple, rmm::device_uvector> sort_by_key( + raft::handle_t const& handle, vertex_t const* keys, value_t const* values, size_t num_pairs); + +template +void translate_vertex_ids(raft::handle_t const& handle, + rmm::device_uvector& d_src_v, + rmm::device_uvector& d_dst_v, + vertex_t vertex_id_offset); + +template +void populate_vertex_ids(raft::handle_t const& handle, + rmm::device_uvector& d_vertices_v, + vertex_t vertex_id_offset); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/visitors/bfs_test.cpp b/cpp/tests/visitors/bfs_test.cpp new file mode 100644 index 00000000000..0e216683b0c --- /dev/null +++ b/cpp/tests/visitors/bfs_test.cpp @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+// Andrei Schaffer, aschaffer@nvidia.com
+//
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+// visitor artifacts:
+//
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+#include 
+#include 
+
+template <typename vertex_t, typename edge_t>
+void bfs_reference(edge_t* offsets,
+                   vertex_t* indices,
+                   vertex_t* distances,
+                   vertex_t* predecessors,
+                   vertex_t num_vertices,
+                   vertex_t source,
+                   vertex_t depth_limit = std::numeric_limits<vertex_t>::max())
+{
+  vertex_t depth{0};
+
+  std::fill(distances, distances + num_vertices, std::numeric_limits<vertex_t>::max());
+  std::fill(predecessors,
+            predecessors + num_vertices,
+            cugraph::experimental::invalid_vertex_id<vertex_t>::value);
+
+  *(distances + source) = depth;
+  std::vector<vertex_t> cur_frontier_rows{source};
+  std::vector<vertex_t> new_frontier_rows{};
+
+  while (cur_frontier_rows.size() > 0) {
+    for (auto const row : cur_frontier_rows) {
+      auto nbr_offset_first = *(offsets + row);
+      auto nbr_offset_last  = *(offsets + row + 1);
+      for (auto nbr_offset = nbr_offset_first; nbr_offset != nbr_offset_last; ++nbr_offset) {
+        auto nbr = *(indices + nbr_offset);
+        if (*(distances + nbr) == std::numeric_limits<vertex_t>::max()) {
+          *(distances + nbr)    = depth + 1;
+          *(predecessors + nbr) = row;
+          new_frontier_rows.push_back(nbr);
+        }
+      }
+    }
+    std::swap(cur_frontier_rows, new_frontier_rows);
+    new_frontier_rows.clear();
+    ++depth;
+    if (depth >= depth_limit) { break; }
+  }
+
+  return;
+}
+
+typedef struct BFS_Usecase_t {
+  std::string graph_file_full_path{};
+  size_t source{0};
+
+  BFS_Usecase_t(std::string const& graph_file_path, size_t source) : source(source)
+  {
+    if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
+      graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path;
+    } else {
+      graph_file_full_path = graph_file_path;
+    }
+  };
+} BFS_Usecase;
+
+class Tests_BFS : public ::testing::TestWithParam<BFS_Usecase> {
+ public:
+  Tests_BFS() {}
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t>
+  void run_current_test(BFS_Usecase const& configuration)
+  {
+    using namespace cugraph::experimental;
+    using namespace cugraph::visitors;
+
+    using weight_t = float;
+
+    raft::handle_t handle{};
+
+    bool test_weighted = false;
+
+    // extract graph data from graph matrix file:
+    //
+    auto&& [d_src, d_dst, opt_d_w, num_vertices, is_sym] =
+      cugraph::test::read_edgelist_from_matrix_market_file<vertex_t, weight_t>(
+        handle, configuration.graph_file_full_path, test_weighted);
+
+    graph_properties_t graph_props{is_sym, false};
+    edge_t num_edges = d_dst.size();
+
+    std::optional<weight_t const*> opt_ptr_w;
+    if (opt_d_w.has_value()) { opt_ptr_w = opt_d_w->data(); }
+
+    // to be filled:
+    //
+    cugraph::experimental::edgelist_t<vertex_t, edge_t, weight_t> edgelist{
+      d_src.data(), d_dst.data(), opt_ptr_w, num_edges};
+    bool sorted{false};
+    bool check{false};
+
+    erased_pack_t ep_graph{&handle, &edgelist, &num_vertices, &graph_props, &sorted, &check};
+
+    DTypes vertex_tid = reverse_dmap_t<vertex_t>::type_id;
+    DTypes edge_tid   = reverse_dmap_t<edge_t>::type_id;
+    DTypes weight_tid = reverse_dmap_t<weight_t>::type_id;
+    bool st           = false;
+    bool mg           = false;
+    GTypes graph_tid  = GTypes::GRAPH_T;
+
+    graph_envelope_t graph_envelope{vertex_tid, edge_tid, weight_tid, st, mg, graph_tid, ep_graph};
+
+    auto const* p_graph = dynamic_cast<graph_t<vertex_t, edge_t, weight_t, false, false> const*>(
+      graph_envelope.graph().get());
+
+    auto graph_view = p_graph->view();
+
+    std::vector<edge_t> h_offsets(graph_view.get_number_of_vertices() + 1);
+    std::vector<vertex_t> h_indices(graph_view.get_number_of_edges());
+    raft::update_host(h_offsets.data(),
+                      graph_view.get_matrix_partition_view().get_offsets(),
+                      graph_view.get_number_of_vertices() + 1,
+                      handle.get_stream());
+    raft::update_host(h_indices.data(),
+                      graph_view.get_matrix_partition_view().get_indices(),
+                      graph_view.get_number_of_edges(),
+                      handle.get_stream());
+    CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
+
+    ASSERT_TRUE(configuration.source >= 0 &&
+                configuration.source < graph_view.get_number_of_vertices())
+      << "Starting sources should be >= 0 and"
+      << " less than the number of vertices in the graph.";
+
+    std::vector<vertex_t> h_reference_distances(graph_view.get_number_of_vertices());
+    std::vector<vertex_t> h_reference_predecessors(graph_view.get_number_of_vertices());
+
+    bfs_reference(h_offsets.data(),
+                  h_indices.data(),
+                  h_reference_distances.data(),
+                  h_reference_predecessors.data(),
+                  graph_view.get_number_of_vertices(),
+                  static_cast<vertex_t>(configuration.source),
+                  std::numeric_limits<vertex_t>::max());
+
+    rmm::device_uvector<vertex_t> d_distances(graph_view.get_number_of_vertices(),
+                                              handle.get_stream());
+    rmm::device_uvector<vertex_t> d_predecessors(graph_view.get_number_of_vertices(),
+                                                 handle.get_stream());
+
+    CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+    {
+      // visitors version:
+      //
+      // using namespace cugraph::experimental;
+
+      // in a context where dependent types are known,
+      // type-erasing the graph is not necessary,
+      // hence the `_wrapper()` is not necessary;
+      //
+
+      // packing visitor arguments = bfs algorithm arguments
+      //
+      vertex_t* p_d_dist   = d_distances.begin();
+      vertex_t* p_d_predec = d_predecessors.begin();
+      auto src             = static_cast<vertex_t>(configuration.source);
+      bool dir_opt{false};
+      auto depth_l = std::numeric_limits<vertex_t>::max();
+      bool check{false};
+      erased_pack_t ep{
+        &handle, p_d_dist, p_d_predec, &src, &dir_opt, &depth_l, &check};  // args for bfs()
+
+      // several options to run the BFS algorithm:
+      //
+      // (1.) if a graph object already exists,
+      //      we can use it to make the appropriate
+      //      visitor:
+      //
+      // auto v_uniq_ptr = make_visitor(
+      //   *p_graph,
+      //   [](graph_envelope_t::visitor_factory_t const& vfact, erased_pack_t& parg) {
+      //     return vfact.make_bfs_visitor(parg);
+      //   },
+      //   ep);
+      // p_graph->apply(*v_uniq_ptr);
+
+      // (2.) if a graph object already exists, alternatively we can
+      //      explicitly instantiate the factory and call its make method:
+      //
+      // dependent_factory_t visitor_factory{};  // okay
+      // auto v_uniq_ptr = visitor_factory.make_bfs_visitor(ep);  // okay
+      // p_graph->apply(*v_uniq_ptr);
+
+      // (3.) if only the `graph_envelope_t` object exists,
+      //      we can invoke the algorithm via the wrapper:
+      //
+      return_t ret = cugraph::api::bfs(graph_envelope, ep);
+    }
+
+    CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+
+    std::vector<vertex_t> h_cugraph_distances(graph_view.get_number_of_vertices());
+    std::vector<vertex_t> h_cugraph_predecessors(graph_view.get_number_of_vertices());
+
+    raft::update_host(
+      h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream());
+    raft::update_host(h_cugraph_predecessors.data(),
+                      d_predecessors.data(),
+                      d_predecessors.size(),
+                      handle.get_stream());
+    CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
+
+    ASSERT_TRUE(std::equal(
+      h_reference_distances.begin(), h_reference_distances.end(), h_cugraph_distances.begin()))
+      << "distances do not match with the reference values.";
+
+    for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) {
+      auto i = std::distance(h_cugraph_predecessors.begin(), it);
+      if (*it == cugraph::experimental::invalid_vertex_id<vertex_t>::value) {
+        ASSERT_TRUE(h_reference_predecessors[i] == *it)
+          << "vertex reachability does not match with the reference.";
+      } else {
+        ASSERT_TRUE(h_reference_distances[*it] + 1 == h_reference_distances[i])
+          << "distance to this vertex != distance to the predecessor vertex + 1.";
+        bool found{false};
+        for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) {
+          if (h_indices[j] == i) {
+            found = true;
+            break;
+          }
+        }
+        ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex.";
+      }
+    }
+  }
+};
+
+// FIXME: add tests for type combinations
+TEST_P(Tests_BFS, CheckInt32Int32) { run_current_test<int32_t, int32_t>(GetParam()); }
+
+INSTANTIATE_TEST_CASE_P(simple_test,
+                        Tests_BFS,
+                        ::testing::Values(BFS_Usecase("test/datasets/karate.mtx", 0),
+                                          BFS_Usecase("test/datasets/polbooks.mtx", 0),
+                                          BFS_Usecase("test/datasets/netscience.mtx", 0),
+                                          BFS_Usecase("test/datasets/netscience.mtx", 100),
+                                          BFS_Usecase("test/datasets/wiki2003.mtx", 1000),
+                                          BFS_Usecase("test/datasets/wiki-Talk.mtx", 1000)));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/datasets/README.md b/datasets/README.md
index c7f76a91dfe..e42413fc996 100644
--- a/datasets/README.md
+++ b/datasets/README.md
@@ -1,67 +1,132 @@
-# Cugraph test and benchmark data
-
-## Python
-
-This directory contains small public datasets in `mtx` and `csv` format used by cuGraph's python tests. Graph details:
-
-| Graph         | V     | E     | Directed | Weighted |
-| ------------- | ----- | ----- | -------- | -------- |
-| karate        | 34    | 156   | No       | No       |
-| dolphin       | 62    | 318   | No       | No       |
-| netscience    | 1,589 | 5,484 | No       | Yes      |
-
-**karate** : The graph "karate" contains the network of friendships between the 34 members of a karate club at a US university, as described by Wayne Zachary in 1977.
-
-**dolphin** : The graph dolphins contains an undirected social network of frequent associations between 62 dolphins in a community living off Doubtful Sound, New Zealand, as compiled by Lusseau et al. (2003).
-
-**netscience** : The graph netscience contains a coauthorship network of scientists working on network theory and experiment, as compiled by M. Newman in May 2006.
-
-## C++
-Cugraph's C++ analytics tests need larger datasets (>5GB uncompressed) and reference results (>125MB uncompressed). They can be downloaded by running the provided script from the `datasets` directory.
-```
-cd /datasets
-./get_test_data.sh
-```
-You may run this script from elsewhere and store C++ test input to another location.
- -Before running the tests, you should let cuGraph know where to find the test input by using: -``` -export RAPIDS_DATASET_ROOT_DIR= -``` - -## Benchmarks -Cugraph benchmarks (which can be found [here](../benchmarks)) also use datasets installed to this folder. Because the datasets used for benchmarking are also quite large (~14GB uncompressed), they are not installed by default. To install datasets for benchmarks, run the same script shown above from the `datasets` directory using the `--benchmark` option: -``` -cd /datasets -./get_test_data.sh --benchmark -``` -The datasets installed for benchmarks currently include CSV files for use in creating both directed and undirected graphs: -``` -/datasets/csv - |- directed - |--- cit-Patents.csv (250M) - |--- soc-LiveJournal1.csv (965M) - |- undirected - |--- europe_osm.csv (1.8G) - |--- hollywood.csv (1.5G) - |--- soc-twitter-2010.csv (8.8G) -``` -The benchmark datasets are described below: -| Graph | V | E | Directed | Weighted | -| ----------------- | ---------- | ------------- | -------- | -------- | -| cit-Patents | 3,774,768 | 16,518,948 | Yes | No | -| soc-LiveJournal1 | 4,847,571 | 43,369,619 | Yes | No | -| europe_osm | 50,912,018 | 54,054,660 | No | No | -| hollywood | 1,139,905 | 57,515,616 | No | No | -| soc-twitter-2010 | 21,297,772 | 265,025,809 | No | No | - -**cit-Patents** : A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations. -**soc-LiveJournal** : A graph of the LiveJournal social network. -**europe_osm** : A graph of OpenStreetMap data for Europe. -**hollywood** : A graph of movie actors where vertices are actors, and two actors are joined by an edge whenever they appeared in a movie together. -**soc-twitter-2010** : A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i. - -_NOTE: the benchmark datasets were converted to a CSV format from their original format described in the reference URL below, and in doing so had edge weights and isolated vertices discarded._ - -## Reference -The SuiteSparse Matrix Collection (formerly the University of Florida Sparse Matrix Collection) : https://sparse.tamu.edu/ +# Cugraph test and benchmark data + +## Python + +This directory contains small public datasets in `mtx` and `csv` format used by cuGraph's python tests. Graph details: + +| Graph | V | E | Directed | Weighted | +| ------------- | ----- | ----- | -------- | -------- | +| karate | 34 | 156 | No | No | +| dolphin | 62 | 318 | No | No | +| netscience | 1,589 | 5,484 | No | Yes | + +**karate** : The graph "karate" contains the network of friendships between the 34 members of a karate club at a US university, as described by Wayne Zachary in 1977. + +**dolphin** : The graph dolphins contains an undirected social network of frequent associations between 62 dolphins in a community living off Doubtful Sound, New Zealand, as compiled by Lusseau et al. (2003). + +**netscience** : The graph netscience contains a coauthorship network of scientists working on network theory and experiment, as compiled by M. Newman in May 2006. + + + +### Modified datasets + +The datasets below were added to provide input that contains self-loops, string vertex IDs, isolated vertices, and multiple edges. 
+
+| Graph               | V       | E          | Directed | Weighted  | Self-loops | Isolated V | String V IDs | Multi-edges |
+| ------------------- | ------- | ---------- | -------- | --------- | ---------- | ---------- | ------------ | ----------- |
+| karate_multi_edge   | 34      | 160        | No       | Yes       | No         | No         | No           | Yes         |
+| dolphins_multi_edge | 62      | 325        | No       | Yes       | No         | No         | No           | Yes         |
+| karate_s_loop       | 34      | 160        | No       | Yes       | Yes        | No         | No           | No          |
+| dolphins_s_loop     | 62      | 321        | No       | Yes       | Yes        | No         | No           | No          |
+| karate_mod          | 37      | 156        | No       | No        | No         | Yes        | No           | No          |
+| karate_str          | 34      | 156        | No       | Yes       | No         | No         | Yes          | No          |
+
+**karate_multi_edge** : The graph "karate_multi_edge" is a modified version of the "karate" graph where multi-edges were added.
+
+**dolphins_multi_edge** : The graph "dolphins_multi_edge" is a modified version of the "dolphin" graph where multi-edges were added.
+
+**karate_s_loop** : The graph "karate_s_loop" is a modified version of the "karate" graph where self-loops were added.
+
+**dolphins_s_loop** : The graph "dolphins_s_loop" is a modified version of the "dolphin" graph where self-loops were added.
+
+**karate_mod** : The graph "karate_mod" is a modified version of the "karate" graph where vertices and edges were added.
+
+**karate_str** : The graph "karate_str" contains the network of friendships between the 34 members of a karate club at a US university, as described by Wayne Zachary in 1977. The integer vertex IDs were replaced by strings.
+
+
+### Additional datasets
+
+Larger datasets containing self-loops can be downloaded by running the provided script from the `datasets` directory using the `--self_loops` option:
+```
+cd <cugraph_root>/datasets
+./get_test_data.sh --self_loops
+```
+```
+/datasets/self_loops
+ |-ca-AstroPh (5.3M)
+ |-ca-CondMat (2.8M)
+ |-ca-GrQc (348K)
+ |-ca-HepTh (763K)
+```
+These datasets are not currently used by any tests or benchmarks.
+
+| Graph         | V       | E        | Directed | Weighted | Self-loops | Isolated V | String V IDs | Multi-edges |
+| ------------- | ------- | -------- | -------- | -------- | ---------- | ---------- | ------------ | ----------- |
+| ca-AstroPh    | 18,772  | 198,110  | No       | No       | Yes        | No         | No           | No          |
+| ca-CondMat    | 23,133  | 93,497   | No       | Yes      | Yes        | No         | No           | No          |
+| ca-GrQc       | 5,242   | 14,387   | No       | No       | Yes        | No         | No           | No          |
+| ca-HepTh      | 9,877   | 25,998   | No       | Yes      | Yes        | No         | No           | No          |
+
+**ca-AstroPh** : The graph "ca-AstroPh" covers scientific collaborations between authors of papers submitted to the Astro Physics category in the period from January 1993 to April 2003 (124 months), as described by J. Leskovec, J. Kleinberg and C. Faloutsos in 2007.
+
+**ca-CondMat** : The graph "ca-CondMat" covers scientific collaborations between authors of papers submitted to the Condensed Matter category in the period from January 1993 to April 2003 (124 months), as described by J. Leskovec, J. Kleinberg and C. Faloutsos in 2007.
+
+**ca-GrQc** : The graph "ca-GrQc" covers scientific collaborations between authors of papers submitted to the General Relativity and Quantum Cosmology category in the period from January 1993 to April 2003 (124 months), as described by J. Leskovec, J. Kleinberg and C. Faloutsos in 2007.
+
+**ca-HepTh** : The graph "ca-HepTh" covers scientific collaborations between authors of papers submitted to the High Energy Physics - Theory category in the period from January 1993 to April 2003 (124 months), as described by J. Leskovec, J. Kleinberg and C. Faloutsos in 2007.
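+
+For a quick sanity check of these properties, a dataset can be loaded with `cudf` and inspected directly. The snippet below is only illustrative (the path, column names, and dtypes are placeholders, not part of any test):
+
+```python
+import cudf
+
+# Load a space-separated edge list laid out as: src dst weight
+df = cudf.read_csv("datasets/karate_s_loop.csv",
+                   sep=" ",
+                   names=["src", "dst", "wt"],
+                   dtype={"src": "int32", "dst": "int32", "wt": "float32"})
+
+# Self-loops are edges whose two endpoints are the same vertex
+num_self_loops = int((df["src"] == df["dst"]).sum())
+
+# Multi-edges are repeated (src, dst) pairs
+num_multi_edges = len(df) - len(df.drop_duplicates(subset=["src", "dst"]))
+
+print(num_self_loops, num_multi_edges)
+```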
+
+
+## Custom path to larger datasets directory
+
+Cugraph's C++ and Python analytics tests need larger datasets (>5GB uncompressed) and reference results (>125MB uncompressed). They can be downloaded by running the provided script from the `datasets` directory.
+```
+cd <cugraph_root>/datasets
+./get_test_data.sh
+```
+You may run this script from elsewhere and store C++ or Python test input to another location.
+
+Before running the tests, you should let cuGraph know where to find the test input by using:
+```
+export RAPIDS_DATASET_ROOT_DIR=<path_to_downloaded_datasets_dir>
+```
+
+
+## Benchmarks
+
+Cugraph benchmarks (which can be found [here](../benchmarks)) also use datasets installed to this folder. Because the datasets used for benchmarking are also quite large (~14GB uncompressed), they are not installed by default. To install datasets for benchmarks, run the same script shown above from the `datasets` directory using the `--benchmark` option:
+```
+cd <cugraph_root>/datasets
+./get_test_data.sh --benchmark
+```
+The datasets installed for benchmarks currently include CSV files for use in creating both directed and undirected graphs:
+```
+/datasets/csv
+ |- directed
+ |--- cit-Patents.csv (250M)
+ |--- soc-LiveJournal1.csv (965M)
+ |- undirected
+ |--- europe_osm.csv (1.8G)
+ |--- hollywood.csv (1.5G)
+ |--- soc-twitter-2010.csv (8.8G)
+```
+The benchmark datasets are described below:
+
+| Graph             | V          | E             | Directed | Weighted |
+| ----------------- | ---------- | ------------- | -------- | -------- |
+| cit-Patents       | 3,774,768  | 16,518,948    | Yes      | No       |
+| soc-LiveJournal1  | 4,847,571  | 43,369,619    | Yes      | No       |
+| europe_osm        | 50,912,018 | 54,054,660    | No       | No       |
+| hollywood         | 1,139,905  | 57,515,616    | No       | No       |
+| soc-twitter-2010  | 21,297,772 | 265,025,809   | No       | No       |
+
+**cit-Patents** : A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations.
+**soc-LiveJournal** : A graph of the LiveJournal social network.
+**europe_osm** : A graph of OpenStreetMap data for Europe.
+**hollywood** : A graph of movie actors where vertices are actors, and two actors are joined by an edge whenever they appeared in a movie together.
+**soc-twitter-2010** : A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i.
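+
+As a rough sketch of how these files are typically consumed (illustrative only; the path, column names, and dtypes below are placeholders, not the benchmark code itself), the directed CSVs map naturally to `cugraph.DiGraph` and the undirected ones to `cugraph.Graph`, with the dataset root resolved from `RAPIDS_DATASET_ROOT_DIR`:
+
+```python
+import os
+
+import cudf
+import cugraph
+
+# Resolve the dataset root from the same environment variable the tests use
+root = os.environ.get("RAPIDS_DATASET_ROOT_DIR", "datasets")
+path = os.path.join(root, "csv", "directed", "cit-Patents.csv")
+
+# The benchmark CSVs are unweighted edge lists
+df = cudf.read_csv(path,
+                   sep=" ",
+                   names=["src", "dst"],
+                   dtype={"src": "int32", "dst": "int32"})
+
+G = cugraph.DiGraph()  # cugraph.Graph() for the undirected datasets
+G.from_cudf_edgelist(df, source="src", destination="dst")
+```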
+ +_NOTE: the benchmark datasets were converted to a CSV format from their original format described in the reference URL below, and in doing so had edge weights and isolated vertices discarded._ + +## Reference +The SuiteSparse Matrix Collection (formerly the University of Florida Sparse Matrix Collection) : https://sparse.tamu.edu/ +The Stanford Network Analysis Platform (SNAP) diff --git a/datasets/dolphins_multi_edge.csv b/datasets/dolphins_multi_edge.csv new file mode 100644 index 00000000000..cf6bc70918e --- /dev/null +++ b/datasets/dolphins_multi_edge.csv @@ -0,0 +1,325 @@ +10 0 1.0 +14 0 1.0 +15 0 1.0 +40 0 1.0 +42 0 1.0 +47 0 1.0 +17 1 1.0 +19 1 1.0 +26 1 1.0 +27 1 1.0 +28 1 1.0 +36 1 1.0 +41 1 1.0 +54 1 1.0 +10 2 1.0 +42 2 1.0 +44 2 1.0 +61 2 1.0 +8 3 1.0 +14 3 1.0 +59 3 1.0 +51 4 1.0 +9 5 1.0 +13 5 1.0 +56 5 1.0 +57 5 1.0 +9 6 1.0 +13 6 1.0 +17 6 1.0 +54 6 1.0 +56 6 1.0 +57 6 1.0 +19 7 1.0 +27 7 1.0 +30 7 1.0 +40 7 1.0 +54 7 1.0 +20 8 1.0 +28 8 1.0 +37 8 1.0 +45 8 1.0 +59 8 1.0 +13 9 1.0 +17 9 1.0 +32 9 1.0 +41 9 1.0 +57 9 1.0 +29 10 1.0 +42 10 1.0 +47 10 1.0 +51 11 1.0 +33 12 1.0 +17 13 1.0 +32 13 1.0 +41 13 1.0 +54 13 1.0 +57 13 1.0 +16 14 1.0 +24 14 1.0 +33 14 1.0 +34 14 1.0 +37 14 1.0 +38 14 1.0 +40 14 1.0 +43 14 1.0 +50 14 1.0 +52 14 1.0 +18 15 1.0 +24 15 1.0 +40 15 1.0 +45 15 1.0 +55 15 1.0 +59 15 1.0 +20 16 1.0 +33 16 1.0 +37 16 1.0 +38 16 1.0 +50 16 1.0 +22 17 1.0 +25 17 1.0 +27 17 1.0 +31 17 1.0 +57 17 1.0 +20 18 1.0 +21 18 1.0 +24 18 1.0 +29 18 1.0 +45 18 1.0 +51 18 1.0 +30 19 1.0 +54 19 1.0 +28 20 1.0 +36 20 1.0 +38 20 1.0 +44 20 1.0 +47 20 1.0 +50 20 1.0 +29 21 1.0 +33 21 1.0 +37 21 1.0 +45 21 1.0 +51 21 1.0 +36 23 1.0 +45 23 1.0 +51 23 1.0 +29 24 1.0 +45 24 1.0 +51 24 1.0 +26 25 1.0 +27 25 1.0 +27 26 1.0 +30 28 1.0 +47 28 1.0 +35 29 1.0 +43 29 1.0 +45 29 1.0 +51 29 1.0 +52 29 1.0 +42 30 1.0 +47 30 1.0 +60 32 1.0 +34 33 1.0 +37 33 1.0 +38 33 1.0 +40 33 1.0 +43 33 1.0 +50 33 1.0 +37 34 1.0 +44 34 1.0 +49 34 1.0 +37 36 1.0 +39 36 1.0 +40 36 1.0 +59 36 1.0 +40 37 1.0 +43 37 1.0 +45 37 1.0 +61 37 1.0 +43 38 1.0 +44 38 1.0 +52 38 1.0 +58 38 1.0 +57 39 1.0 +52 40 1.0 +54 41 1.0 +54 41 1.0 +57 41 1.0 +47 42 1.0 +50 42 1.0 +50 42 1.0 +46 43 1.0 +53 43 1.0 +50 45 1.0 +51 45 1.0 +59 45 1.0 +59 45 1.0 +49 46 1.0 +57 48 1.0 +51 50 1.0 +55 51 1.0 +61 53 1.0 +57 54 1.0 +0 10 1.0 +0 14 1.0 +0 15 1.0 +59 45 1.0 +0 40 1.0 +0 42 1.0 +0 47 1.0 +1 17 1.0 +1 19 1.0 +1 26 1.0 +1 27 1.0 +1 28 1.0 +1 36 1.0 +1 41 1.0 +1 54 1.0 +2 10 1.0 +2 42 1.0 +2 44 1.0 +2 61 1.0 +54 41 1.0 +3 8 1.0 +3 14 1.0 +3 59 1.0 +4 51 1.0 +56 6 1.0 +5 9 1.0 +5 13 1.0 +5 56 1.0 +5 57 1.0 +6 9 1.0 +6 13 1.0 +6 17 1.0 +6 54 1.0 +6 56 1.0 +6 57 1.0 +7 19 1.0 +7 27 1.0 +7 30 1.0 +7 40 1.0 +7 54 1.0 +8 20 1.0 +8 28 1.0 +8 37 1.0 +8 45 1.0 +2 61 1.0 +8 59 1.0 +9 13 1.0 +9 17 1.0 +9 32 1.0 +9 41 1.0 +9 57 1.0 +10 29 1.0 +10 42 1.0 +10 47 1.0 +11 51 1.0 +12 33 1.0 +13 17 1.0 +13 32 1.0 +13 41 1.0 +13 54 1.0 +13 57 1.0 +14 16 1.0 +14 24 1.0 +14 33 1.0 +14 34 1.0 +14 37 1.0 +14 38 1.0 +14 40 1.0 +14 43 1.0 +14 50 1.0 +14 52 1.0 +15 18 1.0 +15 24 1.0 +15 40 1.0 +15 45 1.0 +15 55 1.0 +15 59 1.0 +16 20 1.0 +16 33 1.0 +16 37 1.0 +16 38 1.0 +16 50 1.0 +17 22 1.0 +17 25 1.0 +17 27 1.0 +17 31 1.0 +17 57 1.0 +18 20 1.0 +18 21 1.0 +18 24 1.0 +18 29 1.0 +18 45 1.0 +18 51 1.0 +19 30 1.0 +19 54 1.0 +20 28 1.0 +20 36 1.0 +20 38 1.0 +20 44 1.0 +20 47 1.0 +20 50 1.0 +21 29 1.0 +21 33 1.0 +21 37 1.0 +21 45 1.0 +21 51 1.0 +23 36 1.0 +23 45 1.0 +23 51 1.0 +24 29 1.0 +24 45 1.0 +24 51 1.0 +25 26 1.0 +25 27 1.0 +26 27 1.0 +28 30 1.0 +28 47 1.0 
+29 35 1.0 +29 43 1.0 +29 45 1.0 +29 51 1.0 +29 52 1.0 +30 42 1.0 +30 47 1.0 +32 60 1.0 +33 34 1.0 +33 37 1.0 +33 38 1.0 +33 40 1.0 +33 43 1.0 +33 50 1.0 +34 37 1.0 +34 44 1.0 +34 49 1.0 +36 37 1.0 +36 39 1.0 +36 40 1.0 +36 59 1.0 +37 40 1.0 +37 43 1.0 +37 45 1.0 +37 61 1.0 +38 43 1.0 +38 44 1.0 +38 52 1.0 +38 58 1.0 +39 57 1.0 +40 52 1.0 +41 54 1.0 +41 57 1.0 +42 47 1.0 +42 50 1.0 +43 46 1.0 +43 53 1.0 +45 50 1.0 +45 51 1.0 +45 59 1.0 +46 49 1.0 +48 57 1.0 +50 51 1.0 +51 55 1.0 +53 61 1.0 +54 57 1.0 diff --git a/datasets/dolphins_s_loop.csv b/datasets/dolphins_s_loop.csv new file mode 100644 index 00000000000..703b8440afa --- /dev/null +++ b/datasets/dolphins_s_loop.csv @@ -0,0 +1,321 @@ +10 0 1.0 +14 0 1.0 +15 0 1.0 +40 0 1.0 +42 0 1.0 +47 0 1.0 +17 1 1.0 +19 1 1.0 +26 1 1.0 +27 1 1.0 +28 1 1.0 +36 1 1.0 +41 1 1.0 +54 1 1.0 +10 2 1.0 +42 2 1.0 +44 2 1.0 +61 2 1.0 +8 3 1.0 +14 3 1.0 +59 3 1.0 +51 4 1.0 +9 5 1.0 +13 5 1.0 +56 5 1.0 +57 5 1.0 +9 6 1.0 +13 6 1.0 +17 6 1.0 +54 6 1.0 +56 6 1.0 +57 6 1.0 +19 7 1.0 +27 7 1.0 +30 7 1.0 +40 7 1.0 +54 7 1.0 +20 8 1.0 +28 8 1.0 +37 8 1.0 +45 8 1.0 +59 8 1.0 +13 9 1.0 +17 9 1.0 +32 9 1.0 +41 9 1.0 +57 9 1.0 +29 10 1.0 +42 10 1.0 +47 10 1.0 +51 11 1.0 +33 12 1.0 +17 13 1.0 +32 13 1.0 +41 13 1.0 +54 13 1.0 +57 13 1.0 +16 14 1.0 +24 14 1.0 +33 14 1.0 +34 14 1.0 +37 14 1.0 +38 14 1.0 +40 14 1.0 +43 14 1.0 +50 14 1.0 +52 14 1.0 +18 15 1.0 +24 15 1.0 +40 15 1.0 +45 15 1.0 +55 15 1.0 +59 15 1.0 +20 16 1.0 +33 16 1.0 +37 16 1.0 +38 16 1.0 +50 16 1.0 +22 17 1.0 +25 17 1.0 +27 17 1.0 +31 17 1.0 +57 17 1.0 +20 18 1.0 +21 18 1.0 +24 18 1.0 +29 18 1.0 +45 18 1.0 +51 18 1.0 +30 19 1.0 +54 19 1.0 +28 20 1.0 +36 20 1.0 +38 20 1.0 +44 20 1.0 +47 20 1.0 +50 20 1.0 +29 21 1.0 +33 21 1.0 +37 21 1.0 +45 21 1.0 +51 21 1.0 +36 23 1.0 +45 23 1.0 +51 23 1.0 +29 24 1.0 +45 24 1.0 +51 24 1.0 +26 25 1.0 +27 25 1.0 +27 26 1.0 +30 28 1.0 +47 28 1.0 +35 29 1.0 +43 29 1.0 +45 29 1.0 +51 29 1.0 +52 29 1.0 +42 30 1.0 +47 30 1.0 +60 32 1.0 +34 33 1.0 +37 33 1.0 +38 33 1.0 +40 33 1.0 +43 33 1.0 +50 33 1.0 +37 34 1.0 +44 34 1.0 +49 34 1.0 +37 36 1.0 +39 36 1.0 +40 36 1.0 +59 36 1.0 +40 37 1.0 +43 37 1.0 +43 43 1.0 +45 37 1.0 +61 37 1.0 +43 38 1.0 +44 38 1.0 +52 38 1.0 +58 38 1.0 +57 39 1.0 +52 40 1.0 +52 52 1.0 +54 41 1.0 +57 41 1.0 +47 42 1.0 +50 42 1.0 +46 43 1.0 +53 43 1.0 +50 45 1.0 +51 45 1.0 +59 45 1.0 +49 46 1.0 +57 48 1.0 +51 50 1.0 +55 51 1.0 +61 53 1.0 +57 54 1.0 +0 10 1.0 +0 14 1.0 +0 15 1.0 +0 40 1.0 +0 42 1.0 +0 47 1.0 +1 17 1.0 +1 19 1.0 +1 26 1.0 +1 1 1.0 +1 27 1.0 +1 28 1.0 +1 36 1.0 +1 41 1.0 +1 54 1.0 +2 10 1.0 +2 42 1.0 +2 44 1.0 +2 61 1.0 +3 8 1.0 +3 14 1.0 +3 59 1.0 +4 51 1.0 +5 9 1.0 +5 13 1.0 +5 56 1.0 +5 57 1.0 +6 9 1.0 +6 13 1.0 +6 17 1.0 +6 54 1.0 +6 56 1.0 +6 57 1.0 +7 19 1.0 +7 27 1.0 +7 30 1.0 +7 40 1.0 +7 54 1.0 +8 20 1.0 +8 28 1.0 +8 37 1.0 +8 45 1.0 +8 59 1.0 +9 13 1.0 +9 17 1.0 +9 32 1.0 +9 41 1.0 +9 57 1.0 +10 29 1.0 +10 42 1.0 +10 47 1.0 +11 51 1.0 +12 33 1.0 +13 17 1.0 +13 32 1.0 +13 41 1.0 +13 54 1.0 +13 57 1.0 +14 16 1.0 +14 24 1.0 +14 33 1.0 +14 34 1.0 +14 37 1.0 +14 38 1.0 +14 40 1.0 +14 43 1.0 +14 50 1.0 +14 52 1.0 +15 18 1.0 +15 24 1.0 +15 40 1.0 +15 45 1.0 +15 55 1.0 +15 59 1.0 +16 20 1.0 +16 33 1.0 +16 37 1.0 +16 38 1.0 +16 50 1.0 +17 22 1.0 +17 25 1.0 +17 27 1.0 +17 31 1.0 +17 57 1.0 +18 20 1.0 +18 21 1.0 +18 24 1.0 +18 29 1.0 +18 45 1.0 +18 51 1.0 +19 30 1.0 +19 54 1.0 +20 28 1.0 +20 36 1.0 +20 38 1.0 +20 44 1.0 +20 47 1.0 +20 50 1.0 +21 29 1.0 +21 33 1.0 +21 37 1.0 +21 45 1.0 +21 51 1.0 +23 36 1.0 +23 45 1.0 +23 51 1.0 +24 29 1.0 +24 45 1.0 
+24 51 1.0 +25 26 1.0 +25 27 1.0 +26 27 1.0 +28 30 1.0 +28 47 1.0 +29 35 1.0 +29 43 1.0 +29 45 1.0 +29 51 1.0 +29 52 1.0 +30 42 1.0 +30 47 1.0 +32 60 1.0 +33 34 1.0 +33 37 1.0 +33 38 1.0 +33 40 1.0 +33 43 1.0 +33 50 1.0 +34 37 1.0 +34 44 1.0 +34 49 1.0 +36 37 1.0 +36 39 1.0 +36 40 1.0 +36 59 1.0 +37 40 1.0 +37 43 1.0 +37 45 1.0 +37 61 1.0 +38 43 1.0 +38 44 1.0 +38 52 1.0 +38 58 1.0 +39 57 1.0 +40 52 1.0 +41 54 1.0 +41 57 1.0 +42 47 1.0 +42 50 1.0 +43 46 1.0 +43 53 1.0 +45 50 1.0 +45 51 1.0 +45 59 1.0 +46 49 1.0 +48 57 1.0 +50 51 1.0 +51 55 1.0 +53 61 1.0 +54 57 1.0 diff --git a/datasets/eil51.tsp b/datasets/eil51.tsp new file mode 100644 index 00000000000..543d1013c14 --- /dev/null +++ b/datasets/eil51.tsp @@ -0,0 +1,58 @@ +NAME : eil51 +COMMENT : 51-city problem (Christofides/Eilon) +TYPE : TSP +DIMENSION : 51 +EDGE_WEIGHT_TYPE : EUC_2D +NODE_COORD_SECTION +1 37 52 +2 49 49 +3 52 64 +4 20 26 +5 40 30 +6 21 47 +7 17 63 +8 31 62 +9 52 33 +10 51 21 +11 42 41 +12 31 32 +13 5 25 +14 12 42 +15 36 16 +16 52 41 +17 27 23 +18 17 33 +19 13 13 +20 57 58 +21 62 42 +22 42 57 +23 16 57 +24 8 52 +25 7 38 +26 27 68 +27 30 48 +28 43 67 +29 58 48 +30 58 27 +31 37 69 +32 38 46 +33 46 10 +34 61 33 +35 62 63 +36 63 69 +37 32 22 +38 45 35 +39 59 15 +40 5 6 +41 10 17 +42 21 10 +43 5 64 +44 30 15 +45 39 10 +46 32 39 +47 25 32 +48 25 55 +49 48 28 +50 56 37 +51 30 40 +EOF diff --git a/datasets/get_test_data.sh b/datasets/get_test_data.sh index 071a4b8dea3..0bd97b55cb5 100755 --- a/datasets/get_test_data.sh +++ b/datasets/get_test_data.sh @@ -1,3 +1,16 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ #!/bin/bash set -e set -o pipefail @@ -31,6 +44,10 @@ benchmark # ~1s download https://s3.us-east-2.amazonaws.com/rapidsai-data/cugraph/benchmark/hibench/hibench_1_small.tgz benchmark + +# ~0.6s download +https://rapidsai-data.s3.us-east-2.amazonaws.com/cugraph/test/tsplib/datasets.tar.gz +tsplib " EXTENDED_DATASET_DATA=" @@ -44,6 +61,12 @@ BENCHMARK_DATASET_DATA=" https://rapidsai-data.s3.us-east-2.amazonaws.com/cugraph/benchmark/benchmark_csv_data.tgz csv " + +SELF_LOOPS_DATASET_DATA=" +# ~1s download +https://rapidsai-data.s3.us-east-2.amazonaws.com/cugraph/benchmark/benchmark_csv_data_self_loops.tgz +self_loops +" ################################################################################ # Do not change the script below this line if only adding/updating a dataset @@ -54,7 +77,7 @@ function hasArg { } if hasArg -h || hasArg --help; then - echo "$0 [--subset | --benchmark]" + echo "$0 [--subset | --benchmark | --self_loops]" exit 0 fi @@ -63,6 +86,8 @@ if hasArg "--benchmark"; then DATASET_DATA="${BENCHMARK_DATASET_DATA}" elif hasArg "--subset"; then DATASET_DATA="${BASE_DATASET_DATA}" +elif hasArg "--self_loops"; then + DATASET_DATA="${SELF_LOOPS_DATASET_DATA}" # Do not include benchmark datasets by default - too big else DATASET_DATA="${BASE_DATASET_DATA} ${EXTENDED_DATASET_DATA}" diff --git a/datasets/gil262.tsp b/datasets/gil262.tsp new file mode 100755 index 00000000000..cfcb15c3b78 --- /dev/null +++ b/datasets/gil262.tsp @@ -0,0 +1,269 @@ +NAME : gil262 +COMMENT : 262-city problem (Gillet/Johnson) +TYPE : TSP +DIMENSION : 262 +EDGE_WEIGHT_TYPE : EUC_2D +NODE_COORD_SECTION +1 -99 -97 +2 -59 50 +3 0 14 +4 -17 -66 +5 -69 -19 +6 31 12 +7 5 -41 +8 -12 10 +9 -64 70 +10 -12 85 +11 -18 64 +12 -77 -16 +13 -53 88 +14 83 -24 +15 24 41 +16 17 21 +17 42 96 +18 -65 0 +19 -47 -26 +20 85 36 +21 -35 -54 +22 54 -21 +23 64 -17 +24 55 89 +25 17 -25 +26 -61 66 +27 -61 26 +28 17 -72 +29 79 38 +30 -62 -2 +31 -90 -68 +32 52 66 +33 -54 -50 +34 8 -84 +35 37 -90 +36 -83 49 +37 35 -1 +38 7 59 +39 12 48 +40 57 95 +41 92 28 +42 -3 97 +43 -7 52 +44 42 -15 +45 77 -43 +46 59 -49 +47 25 91 +48 69 -19 +49 -82 -14 +50 74 -70 +51 69 59 +52 29 33 +53 -97 9 +54 -58 9 +55 28 93 +56 7 73 +57 -28 73 +58 -76 55 +59 41 42 +60 92 40 +61 -84 -29 +62 -12 42 +63 51 -45 +64 -37 46 +65 -97 35 +66 14 89 +67 60 58 +68 -63 -75 +69 -18 34 +70 -46 -82 +71 -86 -79 +72 -43 -30 +73 -44 7 +74 -3 -20 +75 36 41 +76 -30 -94 +77 79 -62 +78 51 70 +79 -61 -26 +80 6 94 +81 -19 -62 +82 -20 51 +83 -81 37 +84 7 31 +85 52 12 +86 83 -91 +87 -7 -92 +88 82 -74 +89 -70 85 +90 -83 -30 +91 71 -61 +92 85 11 +93 66 -48 +94 78 -87 +95 9 -79 +96 -36 4 +97 66 39 +98 92 -17 +99 -46 -79 +100 -30 -63 +101 -42 63 +102 20 42 +103 15 98 +104 1 -17 +105 64 20 +106 -96 85 +107 93 -29 +108 -40 -84 +109 86 35 +110 91 36 +111 62 -8 +112 -24 4 +113 11 96 +114 -53 62 +115 -28 -71 +116 7 -4 +117 95 -9 +118 -3 17 +119 53 -90 +120 58 -19 +121 -83 84 +122 -1 49 +123 -4 17 +124 -82 -3 +125 -43 47 +126 6 -6 +127 70 99 +128 68 -29 +129 -94 -30 +130 -94 -20 +131 -21 77 +132 64 37 +133 -70 -19 +134 88 65 +135 2 29 +136 33 57 +137 -70 6 +138 -38 -56 +139 -80 -95 +140 -5 -39 +141 8 -22 +142 -61 -76 +143 76 -22 +144 49 -71 +145 -30 -68 +146 1 34 +147 77 79 +148 -58 64 +149 82 -97 +150 -80 55 +151 81 -86 +152 39 -49 +153 -67 72 +154 -25 -89 +155 -44 -95 +156 32 -68 +157 -17 49 +158 93 49 +159 99 81 +160 10 -49 +161 63 -41 +162 38 39 +163 -28 39 +164 -2 -47 +165 38 8 +166 -42 -6 +167 -67 88 +168 19 93 +169 40 27 +170 -61 56 +171 43 33 +172 -18 -39 +173 -69 19 +174 75 -18 +175 31 
85 +176 25 58 +177 -16 36 +178 91 15 +179 60 -39 +180 49 -47 +181 42 33 +182 16 -81 +183 -78 53 +184 53 -80 +185 -46 -26 +186 -25 -54 +187 69 -46 +188 0 -78 +189 -84 74 +190 -16 16 +191 -63 -14 +192 51 -77 +193 -39 61 +194 5 97 +195 -55 39 +196 70 -14 +197 0 95 +198 -45 7 +199 38 -24 +200 50 -37 +201 59 71 +202 -73 -96 +203 -29 72 +204 -47 12 +205 -88 -61 +206 -88 36 +207 -46 -3 +208 26 -37 +209 -39 -67 +210 92 27 +211 -80 -31 +212 93 -50 +213 -20 -5 +214 -22 73 +215 -4 -7 +216 54 -48 +217 -70 39 +218 54 -82 +219 29 41 +220 -87 51 +221 -96 -36 +222 49 8 +223 -5 54 +224 -26 43 +225 -11 60 +226 40 61 +227 82 35 +228 -92 12 +229 -93 -86 +230 -66 63 +231 -72 -87 +232 -57 -84 +233 23 52 +234 -56 -62 +235 -19 59 +236 63 -14 +237 -13 38 +238 -19 87 +239 44 -84 +240 98 -17 +241 -16 62 +242 3 66 +243 26 22 +244 -38 -81 +245 70 80 +246 17 -35 +247 96 -83 +248 -77 80 +249 -14 44 +250 -33 33 +251 33 -33 +252 70 0 +253 -50 60 +254 -50 -60 +255 75 0 +256 0 75 +257 -75 0 +258 0 -75 +259 40 80 +260 40 -80 +261 -60 20 +262 -60 -20 +EOF diff --git a/datasets/karate_mod.mtx b/datasets/karate_mod.mtx new file mode 100644 index 00000000000..3a562406800 --- /dev/null +++ b/datasets/karate_mod.mtx @@ -0,0 +1,81 @@ +2 1 +3 1 +4 1 +5 1 +6 1 +7 1 +8 1 +9 1 +11 1 +12 1 +13 1 +14 1 +18 1 +20 1 +22 1 +32 1 +3 2 +4 2 +8 2 +14 2 +18 2 +20 2 +22 2 +31 2 +4 3 +8 3 +9 3 +10 3 +14 3 +28 3 +29 3 +33 3 +8 4 +13 4 +14 4 +7 5 +11 5 +7 6 +11 6 +17 6 +17 7 +31 9 +33 9 +34 9 +34 10 +34 14 +33 15 +34 15 +33 16 +34 16 +33 19 +34 19 +34 20 +33 21 +34 21 +33 23 +34 23 +26 24 +28 24 +30 24 +33 24 +34 24 +26 25 +28 25 +32 25 +32 26 +30 27 +34 27 +34 28 +32 29 +34 29 +33 30 +34 30 +33 31 +34 31 +33 32 +34 32 +34 33 +35 +36 +37 diff --git a/datasets/karate_multi_edge.csv b/datasets/karate_multi_edge.csv new file mode 100644 index 00000000000..6f331b77a59 --- /dev/null +++ b/datasets/karate_multi_edge.csv @@ -0,0 +1,160 @@ +1 0 1.0 +2 0 1.0 +3 0 1.0 +4 0 1.0 +5 0 1.0 +6 0 1.0 +7 0 1.0 +8 0 1.0 +10 0 1.0 +11 0 1.0 +12 0 1.0 +13 0 1.0 +17 0 1.0 +19 0 1.0 +21 0 1.0 +31 0 1.0 +2 1 1.0 +3 1 1.0 +7 1 1.0 +13 1 1.0 +7 0 1.0 +17 1 1.0 +19 1 1.0 +21 1 1.0 +30 1 1.0 +3 2 1.0 +7 2 1.0 +8 2 1.0 +9 2 1.0 +13 2 1.0 +27 2 1.0 +28 2 1.0 +32 2 1.0 +7 3 1.0 +12 3 1.0 +13 3 1.0 +6 4 1.0 +10 4 1.0 +6 5 1.0 +10 5 1.0 +16 5 1.0 +16 6 1.0 +30 8 1.0 +32 8 1.0 +33 8 1.0 +28 2 1.0 +33 9 1.0 +33 13 1.0 +32 14 1.0 +33 14 1.0 +32 15 1.0 +33 15 1.0 +32 18 1.0 +33 18 1.0 +33 19 1.0 +32 20 1.0 +33 20 1.0 +32 22 1.0 +33 22 1.0 +25 23 1.0 +27 23 1.0 +29 23 1.0 +32 23 1.0 +33 23 1.0 +25 24 1.0 +27 24 1.0 +31 24 1.0 +31 25 1.0 +29 26 1.0 +33 26 1.0 +33 27 1.0 +31 28 1.0 +33 28 1.0 +32 29 1.0 +33 29 1.0 +32 22 1.0 +32 30 1.0 +33 30 1.0 +32 31 1.0 +33 31 1.0 +33 32 1.0 +0 1 1.0 +0 2 1.0 +0 3 1.0 +0 4 1.0 +0 5 1.0 +0 6 1.0 +0 7 1.0 +0 8 1.0 +0 10 1.0 +0 11 1.0 +0 12 1.0 +0 6 1.0 +0 13 1.0 +0 17 1.0 +0 19 1.0 +0 21 1.0 +0 31 1.0 +1 2 1.0 +1 3 1.0 +1 7 1.0 +1 13 1.0 +1 17 1.0 +1 19 1.0 +1 21 1.0 +1 30 1.0 +2 3 1.0 +2 7 1.0 +2 8 1.0 +2 9 1.0 +2 13 1.0 +2 27 1.0 +2 28 1.0 +2 32 1.0 +3 7 1.0 +3 12 1.0 +3 13 1.0 +4 6 1.0 +4 10 1.0 +5 6 1.0 +5 10 1.0 +5 16 1.0 +6 16 1.0 +8 30 1.0 +8 32 1.0 +8 33 1.0 +9 33 1.0 +13 33 1.0 +14 32 1.0 +14 33 1.0 +15 32 1.0 +15 33 1.0 +18 32 1.0 +18 33 1.0 +19 33 1.0 +20 32 1.0 +20 33 1.0 +22 32 1.0 +22 33 1.0 +23 25 1.0 +23 27 1.0 +23 29 1.0 +23 32 1.0 +23 33 1.0 +24 25 1.0 +24 27 1.0 +24 31 1.0 +25 31 1.0 +26 29 1.0 +26 33 1.0 +27 33 1.0 +28 31 1.0 +28 33 1.0 +29 32 1.0 +29 33 1.0 +30 32 1.0 +30 33 1.0 +31 32 1.0 +31 33 1.0 +32 33 1.0 diff --git 
a/datasets/karate_s_loop.csv b/datasets/karate_s_loop.csv new file mode 100644 index 00000000000..3959e5f98b3 --- /dev/null +++ b/datasets/karate_s_loop.csv @@ -0,0 +1,160 @@ +1 0 1.0 +2 0 1.0 +3 0 1.0 +4 0 1.0 +5 0 1.0 +6 0 1.0 +7 0 1.0 +8 0 1.0 +10 0 1.0 +11 0 1.0 +12 0 1.0 +13 0 1.0 +17 0 1.0 +19 0 1.0 +21 0 1.0 +31 0 1.0 +2 1 1.0 +3 1 1.0 +7 1 1.0 +13 1 1.0 +17 1 1.0 +19 1 1.0 +21 1 1.0 +30 1 1.0 +3 2 1.0 +7 2 1.0 +8 2 1.0 +9 2 1.0 +13 2 1.0 +27 2 1.0 +28 2 1.0 +32 2 1.0 +7 3 1.0 +12 3 1.0 +13 3 1.0 +6 4 1.0 +10 4 1.0 +6 5 1.0 +10 5 1.0 +10 10 1.0 +16 5 1.0 +16 6 1.0 +30 8 1.0 +32 8 1.0 +33 8 1.0 +33 9 1.0 +33 13 1.0 +32 14 1.0 +33 14 1.0 +32 15 1.0 +33 15 1.0 +32 18 1.0 +33 18 1.0 +33 19 1.0 +32 20 1.0 +33 20 1.0 +32 22 1.0 +33 22 1.0 +25 23 1.0 +27 23 1.0 +29 23 1.0 +32 23 1.0 +33 23 1.0 +25 24 1.0 +27 24 1.0 +31 24 1.0 +31 25 1.0 +29 26 1.0 +33 26 1.0 +33 27 1.0 +31 28 1.0 +33 28 1.0 +32 29 1.0 +33 29 1.0 +32 30 1.0 +33 30 1.0 +32 31 1.0 +33 31 1.0 +33 32 1.0 +0 1 1.0 +0 2 1.0 +0 3 1.0 +0 4 1.0 +0 5 1.0 +0 6 1.0 +0 7 1.0 +0 8 1.0 +0 10 1.0 +0 11 1.0 +0 12 1.0 +0 13 1.0 +0 17 1.0 +0 19 1.0 +0 21 1.0 +0 31 1.0 +1 2 1.0 +1 3 1.0 +1 7 1.0 +1 13 1.0 +1 1 1.0 +1 17 1.0 +1 19 1.0 +1 21 1.0 +1 30 1.0 +2 3 1.0 +2 7 1.0 +2 8 1.0 +2 9 1.0 +2 13 1.0 +2 27 1.0 +2 28 1.0 +2 32 1.0 +3 7 1.0 +3 12 1.0 +3 13 1.0 +4 6 1.0 +4 10 1.0 +5 6 1.0 +5 10 1.0 +5 16 1.0 +6 16 1.0 +8 30 1.0 +8 32 1.0 +8 33 1.0 +9 33 1.0 +13 33 1.0 +13 13 1.0 +14 32 1.0 +14 33 1.0 +15 32 1.0 +15 33 1.0 +18 32 1.0 +18 33 1.0 +19 33 1.0 +20 32 1.0 +20 33 1.0 +22 32 1.0 +22 33 1.0 +23 25 1.0 +23 27 1.0 +23 29 1.0 +23 32 1.0 +23 33 1.0 +24 25 1.0 +24 27 1.0 +24 31 1.0 +25 31 1.0 +26 29 1.0 +26 33 1.0 +27 33 1.0 +28 31 1.0 +28 33 1.0 +29 32 1.0 +29 33 1.0 +30 32 1.0 +30 33 1.0 +31 32 1.0 +31 31 1.0 +31 33 1.0 +32 33 1.0 diff --git a/datasets/karate_str.mtx b/datasets/karate_str.mtx new file mode 100644 index 00000000000..0564d30f91d --- /dev/null +++ b/datasets/karate_str.mtx @@ -0,0 +1,78 @@ +9q a9 1 +ts a9 1 +kt a9 1 +j7 a9 1 +wr a9 1 +n3 a9 1 +2w a9 1 +8a a9 1 +ci a9 1 +cq a9 1 +ca a9 1 +gd a9 1 +y4 a9 1 +kx a9 1 +u3 a9 1 +id a9 1 +ts 9q 1 +kt 9q 1 +2w 9q 1 +gd 9q 1 +y4 9q 1 +kx 9q 1 +u3 9q 1 +7p 9q 1 +kt ts 1 +2w ts 1 +8a ts 1 +ax ts 1 +gd ts 1 +84 ts 1 +ar ts 1 +05 ts 1 +2w kt 1 +ca kt 1 +gd kt 1 +n3 j7 1 +ci j7 1 +n3 wr 1 +ci wr 1 +27 wr 1 +27 n3 1 +7p 8a 1 +05 8a 1 +ux 8a 1 +ux ax 1 +ux gd 1 +05 r9 1 +ux r9 1 +05 44 1 +ux 44 1 +05 a6 1 +ux a6 1 +ux kx 1 +05 d5 1 +ux d5 1 +05 gk 1 +ux gk 1 +fo em 1 +84 em 1 +wc em 1 +05 em 1 +ux em 1 +fo 1j 1 +84 1j 1 +id 1j 1 +id fo 1 +wc nm 1 +ux nm 1 +ux 84 1 +id ar 1 +ux ar 1 +05 wc 1 +ux wc 1 +05 7p 1 +ux 7p 1 +05 id 1 +ux id 1 +ux 05 1 diff --git a/datasets/kroA100.tsp b/datasets/kroA100.tsp new file mode 100644 index 00000000000..05ebae994ac --- /dev/null +++ b/datasets/kroA100.tsp @@ -0,0 +1,107 @@ +NAME: kroA100 +TYPE: TSP +COMMENT: 100-city problem A (Krolak/Felts/Nelson) +DIMENSION: 100 +EDGE_WEIGHT_TYPE : EUC_2D +NODE_COORD_SECTION +1 1380 939 +2 2848 96 +3 3510 1671 +4 457 334 +5 3888 666 +6 984 965 +7 2721 1482 +8 1286 525 +9 2716 1432 +10 738 1325 +11 1251 1832 +12 2728 1698 +13 3815 169 +14 3683 1533 +15 1247 1945 +16 123 862 +17 1234 1946 +18 252 1240 +19 611 673 +20 2576 1676 +21 928 1700 +22 53 857 +23 1807 1711 +24 274 1420 +25 2574 946 +26 178 24 +27 2678 1825 +28 1795 962 +29 3384 1498 +30 3520 1079 +31 1256 61 +32 1424 1728 +33 3913 192 +34 3085 1528 +35 2573 1969 +36 463 1670 +37 3875 598 +38 298 1513 +39 3479 821 +40 2542 236 +41 3955 1743 +42 1323 280 +43 3447 1830 +44 
2936 337 +45 1621 1830 +46 3373 1646 +47 1393 1368 +48 3874 1318 +49 938 955 +50 3022 474 +51 2482 1183 +52 3854 923 +53 376 825 +54 2519 135 +55 2945 1622 +56 953 268 +57 2628 1479 +58 2097 981 +59 890 1846 +60 2139 1806 +61 2421 1007 +62 2290 1810 +63 1115 1052 +64 2588 302 +65 327 265 +66 241 341 +67 1917 687 +68 2991 792 +69 2573 599 +70 19 674 +71 3911 1673 +72 872 1559 +73 2863 558 +74 929 1766 +75 839 620 +76 3893 102 +77 2178 1619 +78 3822 899 +79 378 1048 +80 1178 100 +81 2599 901 +82 3416 143 +83 2961 1605 +84 611 1384 +85 3113 885 +86 2597 1830 +87 2586 1286 +88 161 906 +89 1429 134 +90 742 1025 +91 1625 1651 +92 1187 706 +93 1787 1009 +94 22 987 +95 3640 43 +96 3756 882 +97 776 392 +98 1724 1642 +99 198 1810 +100 3950 1558 +EOF diff --git a/datasets/tsp225.tsp b/datasets/tsp225.tsp new file mode 100644 index 00000000000..ac9e06cecc1 --- /dev/null +++ b/datasets/tsp225.tsp @@ -0,0 +1,232 @@ +NAME : tsp225 +COMMENT : A TSP problem (Reinelt) +TYPE : TSP +DIMENSION : 225 +EDGE_WEIGHT_TYPE : EUC_2D +NODE_COORD_SECTION +1 155.42 150.65 +2 375.92 164.65 +3 183.92 150.65 +4 205.42 150.65 +5 205.42 171.65 +6 226.42 171.65 +7 226.42 186.15 +8 226.42 207.15 +9 226.42 235.65 +10 226.42 264.15 +11 226.42 292.65 +12 226.42 314.15 +13 226.42 335.65 +14 205.42 335.65 +15 190.92 335.65 +16 190.92 328.15 +17 176.92 328.15 +18 176.92 299.65 +19 155.42 299.65 +20 155.42 328.15 +21 155.42 356.65 +22 183.92 356.65 +23 219.42 356.65 +24 240.92 356.65 +25 269.42 356.65 +26 290.42 356.65 +27 387.42 136.15 +28 318.92 356.65 +29 318.92 335.65 +30 318.92 328.15 +31 318.92 299.65 +32 297.92 299.65 +33 290.42 328.15 +34 290.42 335.65 +35 297.92 328.15 +36 254.92 335.65 +37 254.92 314.15 +38 254.92 292.65 +39 254.92 271.65 +40 254.92 243.15 +41 254.92 221.65 +42 254.92 193.15 +43 254.92 171.65 +44 276.42 171.65 +45 296.42 150.65 +46 276.42 150.65 +47 375.92 150.65 +48 308.92 150.65 +49 354.92 164.65 +50 338.42 174.65 +51 354.92 174.65 +52 338.42 200.15 +53 338.42 221.65 +54 354.92 221.65 +55 354.92 200.15 +56 361.92 200.15 +57 361.92 186.15 +58 383.42 186.15 +59 383.42 179.15 +60 404.42 179.15 +61 404.42 186.15 +62 418.92 186.15 +63 418.92 200.15 +64 432.92 200.15 +65 432.92 221.65 +66 418.92 221.65 +67 418.92 235.65 +68 397.42 235.65 +69 397.42 243.15 +70 375.92 243.15 +71 375.92 257.15 +72 368.92 257.15 +73 368.92 264.15 +74 347.42 264.15 +75 347.42 278.65 +76 336.42 278.65 +77 336.42 328.15 +78 347.42 328.15 +79 347.42 342.65 +80 368.92 342.65 +81 368.92 353.65 +82 418.92 353.65 +83 418.92 342.65 +84 432.92 342.65 +85 432.92 356.65 +86 447.42 356.65 +87 447.42 321.15 +88 447.42 292.65 +89 432.92 292.65 +90 432.92 314.15 +91 418.92 314.15 +92 418.92 321.15 +93 397.42 321.15 +94 397.42 333.65 +95 375.92 333.65 +96 375.92 321.15 +97 361.92 321.15 +98 361.92 299.65 +99 375.92 299.65 +100 375.92 285.65 +101 397.42 285.65 +102 397.42 271.65 +103 418.92 271.65 +104 418.92 264.15 +105 439.92 264.15 +106 439.92 250.15 +107 454.42 250.15 +108 454.42 243.15 +109 461.42 243.15 +110 461.42 214.65 +111 461.42 193.15 +112 447.42 193.15 +113 447.42 179.15 +114 439.92 179.15 +115 439.92 167.65 +116 419.92 167.65 +117 419.92 150.65 +118 439.92 150.65 +119 454.42 150.65 +120 475.92 150.65 +121 475.92 171.65 +122 496.92 171.65 +123 496.92 193.15 +124 496.92 214.65 +125 496.92 243.15 +126 496.92 271.65 +127 496.92 292.65 +128 496.92 317.15 +129 496.92 335.65 +130 470.42 335.65 +131 470.42 356.65 +132 496.92 356.65 +133 347.42 150.65 +134 539.92 356.65 +135 560.92 356.65 +136 589.42 356.65 +137 589.42 342.65 +138 603.92 342.65 
+139 610.92 342.65 +140 610.92 335.65 +141 610.92 321.15 +142 624.92 321.15 +143 624.92 278.65 +144 610.92 278.65 +145 610.92 257.15 +146 589.42 257.15 +147 589.42 250.15 +148 575.42 250.15 +149 560.92 250.15 +150 542.92 250.15 +151 542.92 264.15 +152 560.92 264.15 +153 575.42 264.15 +154 575.42 271.65 +155 582.42 271.65 +156 582.42 285.65 +157 596.42 285.65 +158 560.92 335.65 +159 596.42 314.15 +160 582.42 314.15 +161 582.42 321.15 +162 575.42 321.15 +163 575.42 335.65 +164 525.42 335.65 +165 525.42 314.15 +166 525.42 299.65 +167 525.42 281.65 +168 525.42 233.15 +169 525.42 214.65 +170 525.42 193.15 +171 525.42 171.65 +172 546.92 171.65 +173 546.92 150.65 +174 568.42 150.65 +175 475.92 160.65 +176 603.92 150.65 +177 624.92 150.65 +178 624.92 136.15 +179 596.42 136.15 +180 575.42 136.15 +181 553.92 136.15 +182 532.42 136.15 +183 575.42 356.65 +184 489.92 136.15 +185 468.42 136.15 +186 447.42 136.15 +187 425.92 136.15 +188 404.42 136.15 +189 370.42 136.15 +190 361.92 150.65 +191 340.42 136.15 +192 326.42 136.15 +193 301.92 136.15 +194 276.42 136.15 +195 254.92 136.15 +196 315.92 136.15 +197 212.42 136.15 +198 190.92 136.15 +199 338.92 150.65 +200 155.42 136.15 +201 624.92 299.65 +202 318.92 321.65 +203 155.42 314.15 +204 311.92 356.65 +205 355.42 136.15 +206 318.92 314.15 +207 362.92 164.65 +208 254.92 356.65 +209 383.42 333.65 +210 447.42 335.65 +211 470.42 345.65 +212 525.42 250.15 +213 546.92 335.65 +214 525.42 261.15 +215 525.42 356.65 +216 336.42 298.65 +217 336.42 313.15 +218 293.42 136.15 +219 336.42 306.15 +220 425.92 264.15 +221 391.42 353.65 +222 482.92 335.65 +223 429.92 167.65 +224 330.92 150.65 +225 368.42 150.65 +EOF diff --git a/docs/Makefile b/docs/cugraph/Makefile similarity index 100% rename from docs/Makefile rename to docs/cugraph/Makefile diff --git a/docs/README.md b/docs/cugraph/README.md similarity index 100% rename from docs/README.md rename to docs/cugraph/README.md diff --git a/docs/make.bat b/docs/cugraph/make.bat similarity index 100% rename from docs/make.bat rename to docs/cugraph/make.bat diff --git a/docs/requirement.txt b/docs/cugraph/requirement.txt similarity index 100% rename from docs/requirement.txt rename to docs/cugraph/requirement.txt diff --git a/docs/source/_static/EMPTY b/docs/cugraph/source/_static/EMPTY similarity index 100% rename from docs/source/_static/EMPTY rename to docs/cugraph/source/_static/EMPTY diff --git a/docs/source/_static/copybutton.css b/docs/cugraph/source/_static/copybutton.css similarity index 100% rename from docs/source/_static/copybutton.css rename to docs/cugraph/source/_static/copybutton.css diff --git a/docs/cugraph/source/_static/copybutton_pydocs.js b/docs/cugraph/source/_static/copybutton_pydocs.js new file mode 100644 index 00000000000..cec05777e6b --- /dev/null +++ b/docs/cugraph/source/_static/copybutton_pydocs.js @@ -0,0 +1,65 @@ +$(document).ready(function() { + /* Add a [>>>] button on the top-right corner of code samples to hide + * the >>> and ... prompts and the output and thus make the code + * copyable. 
*/ + var div = $('.highlight-python .highlight,' + + '.highlight-python3 .highlight,' + + '.highlight-pycon .highlight,' + + '.highlight-default .highlight'); + var pre = div.find('pre'); + + // get the styles from the current theme + pre.parent().parent().css('position', 'relative'); + var hide_text = 'Hide the prompts and output'; + var show_text = 'Show the prompts and output'; + var border_width = pre.css('border-top-width'); + var border_style = pre.css('border-top-style'); + var border_color = pre.css('border-top-color'); + var button_styles = { + 'cursor':'pointer', 'position': 'absolute', 'top': '0', 'right': '0', + 'border-color': border_color, 'border-style': border_style, + 'border-width': border_width, 'text-size': '75%', + 'font-family': 'monospace', 'padding-left': '0.2em', 'padding-right': '1.5em', + 'border-radius': '0 3px 0 0', + 'transition': "0.5s" + } + + // create and add the button to all the code blocks that contain >>> + div.each(function(index) { + var jthis = $(this); + if (jthis.find('.gp').length > 0) { + var button = $('<span class="copybutton">&gt;&gt;&gt;</span>'); + button.css(button_styles) + button.attr('title', hide_text); + button.data('hidden', 'false'); + jthis.prepend(button); + } + // tracebacks (.gt) contain bare text elements that need to be + // wrapped in a span to work with .nextUntil() (see later) + jthis.find('pre:has(.gt)').contents().filter(function() { + return ((this.nodeType == 3) && (this.data.trim().length > 0)); + }).wrap('<span>'); + }); + + // define the behavior of the button when it's clicked + $('.copybutton').click(function(e){ + e.preventDefault(); + var button = $(this); + if (button.data('hidden') === 'false') { + // hide the code output + button.parent().find('.go, .gp, .gt').hide(); + button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'hidden'); + button.css('text-decoration', 'line-through'); + button.attr('title', show_text); + button.data('hidden', 'true'); + } else { + // show the code output + button.parent().find('.go, .gp, .gt').show(); + button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible'); + button.css('text-decoration', 'none'); + button.attr('title', hide_text); + button.data('hidden', 'false'); + } + }); +}); + diff --git a/docs/source/_static/example_mod.js b/docs/cugraph/source/_static/example_mod.js similarity index 100% rename from docs/source/_static/example_mod.js rename to docs/cugraph/source/_static/example_mod.js diff --git a/docs/source/_static/params.css b/docs/cugraph/source/_static/params.css similarity index 100% rename from docs/source/_static/params.css rename to docs/cugraph/source/_static/params.css diff --git a/docs/source/_static/references.css b/docs/cugraph/source/_static/references.css similarity index 100% rename from docs/source/_static/references.css rename to docs/cugraph/source/_static/references.css diff --git a/docs/source/api.rst b/docs/cugraph/source/api.rst similarity index 63% rename from docs/source/api.rst rename to docs/cugraph/source/api.rst index 459e5fbf4f1..81c5e3e4ee2 100644 --- a/docs/source/api.rst +++ b/docs/cugraph/source/api.rst @@ -10,7 +10,7 @@ Structure Graph ----- -.. autoclass:: cugraph.structure.graph.Graph +.. autoclass:: cugraph.structure.graph_classes.Graph :members: :undoc-members: @@ -40,13 +40,6 @@ Betweenness Centrality :members: :undoc-members: -Edge Betweenness Centrality ---------------------------- - -..
automodule:: cugraph.centrality.edge_betweenness_centrality - :members: - :undoc-members: - Katz Centrality --------------- @@ -55,9 +48,23 @@ Katz Centrality :undoc-members: +Katz Centrality (MG) +-------------------- + +.. automodule:: cugraph.dask.centrality.katz_centrality + :members: + :undoc-members: + Community ========= +EgoNet +------------------------------------ + +.. automodule:: cugraph.community.egonet + :members: + :undoc-members: + Ensemble clustering for graphs (ECG) ------------------------------------ @@ -86,6 +93,14 @@ Louvain :members: :undoc-members: +Louvain (MG) +------------ + +.. automodule:: cugraph.dask.community.louvain + :members: + :undoc-members: + + Spectral Clustering ------------------- @@ -118,6 +133,12 @@ Connected Components :members: :undoc-members: +Connected Components (MG) +-------------------- + +.. automodule:: cugraph.dask.components.connectivity + :members: + :undoc-members: Cores ===== @@ -128,7 +149,7 @@ Core Number .. automodule:: cugraph.cores.core_number :members: :undoc-members: - + K-Core ------ @@ -148,6 +169,17 @@ Force Atlas 2 :undoc-members: +Linear Assignment +================= + +Hungarian +------------- + +.. automodule:: cugraph.linear_assignment.hungarian + :members: + :undoc-members: + + Link Analysis ============= @@ -165,6 +197,13 @@ Pagerank :members: :undoc-members: +Pagerank (MG) +------------- + +.. automodule:: cugraph.dask.link_analysis.pagerank + :members: pagerank + :undoc-members: + Link Prediction =============== @@ -192,6 +231,17 @@ Overlap Coefficient :undoc-members: +Sampling +======== + +Random Walks +------------ + +.. automodule:: cugraph.sampling.random_walks + :members: + :undoc-members: + + Traversal ========= @@ -202,6 +252,13 @@ Breadth-first-search :members: :undoc-members: +Breadth-first-search (MG) +------------------------- + +.. automodule:: cugraph.dask.traversal.bfs + :members: + :undoc-members: + Single-source-shortest-path --------------------------- @@ -209,6 +266,20 @@ Single-source-shortest-path :members: :undoc-members: +Single-source-shortest-path (MG) +-------------------------------- + +.. automodule:: cugraph.dask.traversal.sssp + :members: + :undoc-members: + +Traveling-salesperson-problem +----------------------------- + +.. automodule:: cugraph.traversal.traveling_salesperson + :members: + :undoc-members: + Tree ========= @@ -217,13 +288,37 @@ Minimum Spanning Tree --------------------- .. automodule:: cugraph.tree.minimum_spanning_tree - :members: + :members: minimum_spanning_tree :undoc-members: Maximum Spanning Tree --------------------- -.. automodule:: cugraph.tree.maximum_spanning_tree - :members: +.. automodule:: cugraph.tree.minimum_spanning_tree + :members: maximum_spanning_tree :undoc-members: + :noindex: + +Generator +========= + +RMAT +--------------------- + +.. automodule:: cugraph.generators + :members: rmat + :undoc-members: + + +DASK MG Helper functions +=========================== + +.. automodule:: cugraph.comms.comms + :members: initialize, destroy + :undoc-members: + :member-order: bysource + +.. automodule:: cugraph.dask.common.read_utils + :members: get_chunksize + :undoc-members: diff --git a/docs/source/conf.py b/docs/cugraph/source/conf.py similarity index 97% rename from docs/source/conf.py rename to docs/cugraph/source/conf.py index adec59a2f6c..a4633d04f8d 100644 --- a/docs/source/conf.py +++ b/docs/cugraph/source/conf.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # -# Copyright (c) 2018-2020 NVIDIA CORPORATION. 
+# Copyright (c) 2018-2021, NVIDIA CORPORATION. # # pygdf documentation build configuration file, created by # sphinx-quickstart on Wed May 3 10:59:22 2017. @@ -42,17 +42,17 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'numpydoc', - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', + "sphinx.ext.intersphinx", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "numpydoc", + "sphinx_markdown_tables", 'sphinx.ext.doctest', - 'sphinx.ext.intersphinx', 'sphinx.ext.linkcode', "IPython.sphinxext.ipython_console_highlighting", "IPython.sphinxext.ipython_directive", "nbsphinx", "recommonmark", - "sphinx_markdown_tables", ] @@ -80,9 +80,9 @@ # built documents. # # The short X.Y version. -version = '0.17' +version = '0.20' # The full version, including alpha/beta/rc tags. -release = '0.17.0' +release = '0.20.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/source/cugraph_blogs.rst b/docs/cugraph/source/cugraph_blogs.rst similarity index 93% rename from docs/source/cugraph_blogs.rst rename to docs/cugraph/source/cugraph_blogs.rst index 84e31d40a19..cbbc93a1b14 100644 --- a/docs/source/cugraph_blogs.rst +++ b/docs/cugraph/source/cugraph_blogs.rst @@ -23,6 +23,12 @@ BLOGS * `Status of RAPIDS cuGraphβ€Šβ€”β€ŠRefactoring Code And Rethinking Graphs `_ * `Tackling Large Graphs with RAPIDS cuGraph and CUDA Unified Memory on GPUs `_ * `RAPIDS cuGraph adds NetworkX and DiGraph Compatibility `_ + * `Large Graph Visualization with RAPIDS cuGraph `_ + +2021 +------ + * + Media diff --git a/docs/cugraph/source/cugraph_intro.md b/docs/cugraph/source/cugraph_intro.md new file mode 100644 index 00000000000..142395fb719 --- /dev/null +++ b/docs/cugraph/source/cugraph_intro.md @@ -0,0 +1,69 @@ + +# cuGraph Introduction +The Data Scientist has a collection of techniques within their +proverbial toolbox. Data engineering, statistical analysis, and +machine learning are among the most commonly known. However, there +are numerous cases where the focus of the analysis is on the +relationship between data elements. In those cases, the data is best +represented as a graph. Graph analysis, also called network analysis, +is a collection of algorithms for answering questions posed against +graph data. Graph analysis is not new. + +The first graph problem was posed by Euler in 1736, the [Seven Bridges of +Konigsberg](https://en.wikipedia.org/wiki/Seven_Bridges_of_K%C3%B6nigsberg), +and laid the foundation for the mathematical field of graph theory. +The application of graph analysis covers a wide variety of fields, including +marketing, biology, physics, computer science, sociology, and cyber to name a few. + +RAPIDS cuGraph is a library of graph algorithms that seamlessly integrates +into the RAPIDS data science ecosystem and allows the data scientist to easily +call graph algorithms using data stored in a GPU DataFrame, NetworkX Graphs, or even +CuPy or SciPy sparse Matrix. + + +# Vision +The vision of RAPIDS cuGraph is to ___make graph analysis ubiquitous to the +point that users just think in terms of analysis and not technologies or +frameworks___. This is a goal that many of us on the cuGraph team have been +working on for almost twenty years. Many of the early attempts focused on +solving one problem or using one technique. 
Those early attempts worked for +the initial goal but tended to break as the scope changed (e.g., shifting +to solving a dynamic graph problem with a static graph solution). The limiting +factors usually came down to compute power, ease-of-use, or choosing a data +structure that was not suited for all problems. NVIDIA GPUs, CUDA, and RAPIDS +have totally changed the paradigm and the goal of an accelerated unified graph +analytic library is now possible. + +The compute power of the latest NVIDIA GPUs (RAPIDS supports Pascal and later +GPU architectures) make graph analytics 1000x faster on average over NetworkX. +Moreover, the internal memory speed within a GPU allows cuGraph to rapidly +switch the data structure to best suit the needs of the analytic rather than +being restricted to a single data structure. cuGraph is working with several +frameworks for both static and dynamic graph data structures so that we always +have a solution to any graph problem. Since Python has emerged as the de facto +language for data science, allowing interactivity and the ability to run graph +analytics in Python makes cuGraph familiar and approachable. RAPIDS wraps all +the graph analytic goodness mentioned above with the ability to perform +high-speed ETL, statistics, and machine learning. To make things even better, +RAPIDS and DASK allows cuGraph to scale to multiple GPUs to support +multi-billion edge graphs. + + +# Terminology + +cuGraph is a collection of GPU accelerated graph algorithms and graph utility +functions. The application of graph analysis covers a lot of areas. +For Example: +* [Network Science](https://en.wikipedia.org/wiki/Network_science) +* [Complex Network](https://en.wikipedia.org/wiki/Complex_network) +* [Graph Theory](https://en.wikipedia.org/wiki/Graph_theory) +* [Social Network Analysis](https://en.wikipedia.org/wiki/Social_network_analysis) + +cuGraph does not favor one field over another. Our developers span the +breadth of fields with the focus being to produce the best graph library +possible. However, each field has its own argot (jargon) for describing the +graph (or network). In our documentation, we try to be consistent. In Python +documentation we will mostly use the terms __Node__ and __Edge__ to better +match NetworkX preferred term use, as well as other Python-based tools. At +the CUDA/C layer, we favor the mathematical terms of __Vertex__ and __Edge__. + diff --git a/docs/source/cugraph_ref.rst b/docs/cugraph/source/cugraph_ref.rst similarity index 61% rename from docs/source/cugraph_ref.rst rename to docs/cugraph/source/cugraph_ref.rst index 591619fb338..e0f113eaba4 100644 --- a/docs/source/cugraph_ref.rst +++ b/docs/cugraph/source/cugraph_ref.rst @@ -2,22 +2,35 @@ References ########## +************ +Architecture +************ + +2-D Data Partitioning + +- Kang, S., Fender, A., Eaton, J., & Rees, B. (2020, September) *Computing PageRank Scores of Web Crawl Data Using DGX A100 Clusters*. In 2020 IEEE High Performance Extreme Computing Conference (HPEC) (pp. 1-4). IEEE. + + +| + +| + ********** Algorithms ********** Betweenness Centrality -- Brandes, U. (2001). A faster algorithm for betweenness centrality. Journal of mathematical sociology, 25(2), 163-177. -- Brandes, U. (2008). On variants of shortest-path betweenness centrality and their generic computation. Social Networks, 30(2), 136-145. -- McLaughlin, A., & Bader, D. A. (2018). Accelerating GPU betweenness centrality. Communications of the ACM, 61(8), 85-92. +- Brandes, U. (2001). 
*A faster algorithm for betweenness centrality*. Journal of mathematical sociology, 25(2), 163-177. +- Brandes, U. (2008). *On variants of shortest-path betweenness centrality and their generic computation*. Social Networks, 30(2), 136-145. +- McLaughlin, A., & Bader, D. A. (2018). *Accelerating GPU betweenness centrality*. Communications of the ACM, 61(8), 85-92. Katz - J. Cohen, *Trusses: Cohesive subgraphs for social network analysis* National security agency technical report, 2008 - O. Green, J. Fox, E. Kim, F. Busato, et al. *Quickly Finding a Truss in a Haystack* IEEE High Performance Extreme Computing Conference (HPEC), 2017 https://doi.org/10.1109/HPEC.2017.8091038 -- O. Green, P. Yalamanchili, L.M. Munguia, β€œ*ast Triangle Counting on GPU* Irregular Applications: Architectures and Algorithms (IA3), 2014 +- O. Green, P. Yalamanchili, L.M. Munguia, *Fast Triangle Counting on GPU* Irregular Applications: Architectures and Algorithms (IA3), 2014 Hungarian Algorithm @@ -27,6 +40,15 @@ Hungarian Algorithm | +************* +Other Papers +************* +- Hricik, T., Bader, D., & Green, O. (2020, September). *Using RAPIDS AI to Accelerate Graph Data Science Workflows*. In 2020 IEEE High Performance Extreme Computing Conference (HPEC) (pp. 1-4). IEEE. + +| + +| + ********** Data Sets ********** diff --git a/docs/cugraph/source/dask-cugraph.rst b/docs/cugraph/source/dask-cugraph.rst new file mode 100644 index 00000000000..51487bfbf05 --- /dev/null +++ b/docs/cugraph/source/dask-cugraph.rst @@ -0,0 +1,53 @@ +~~~~~~~~~~~~~~~~~~~~~~ +Multi-GPU with cuGraph +~~~~~~~~~~~~~~~~~~~~~~ + +cuGraph supports multi-GPU leveraging `Dask `_. Dask is a flexible library for parallel computing in Python which makes scaling out your workflow smooth and simple. cuGraph also uses other Dask-based RAPIDS projects such as `dask-cuda `_. The maximum graph size is currently limited to 2 Billion vertices (to be waived in the next versions). + +Distributed graph analytics +=========================== + +The current solution is able to scale across multiple GPUs on multiple machines. Distributing the graph and computation lets you analyze datasets far larger than a single GPU’s memory. + +With cuGraph and Dask, whether you’re using a single NVIDIA GPU or multiple nodes, your RAPIDS workflow will run smoothly, intelligently distributing the workload across the available resources. + +If your graph comfortably fits in memory on a single GPU, you would want to use the single-GPU version of cuGraph. If you want to distribute your workflow across multiple GPUs and have more data than you can fit in memory on a single GPU, you would want to use cuGraph's multi-GPU features. + +Example +======== + +.. 
code-block:: python + + from dask.distributed import Client, wait + from dask_cuda import LocalCUDACluster + import cugraph.comms as Comms + import cugraph.dask as dask_cugraph + + cluster = LocalCUDACluster() + client = Client(cluster) + Comms.initialize(p2p=True) + + # Helper function to set the reader chunk size to automatically get one partition per GPU + chunksize = dask_cugraph.get_chunksize(input_data_path) + + # Multi-GPU CSV reader + e_list = dask_cudf.read_csv(input_data_path, + chunksize = chunksize, + delimiter=' ', + names=['src', 'dst'], + dtype=['int32', 'int32']) + + G = cugraph.DiGraph() + G.from_dask_cudf_edgelist(e_list, source='src', destination='dst') + + # now run PageRank + pr_df = dask_cugraph.pagerank(G, tol=1e-4) + + # All done, clean up + Comms.destroy() + client.close() + cluster.close() + + +| + diff --git a/docs/source/images/Nx_Cg_1.png b/docs/cugraph/source/images/Nx_Cg_1.png similarity index 100% rename from docs/source/images/Nx_Cg_1.png rename to docs/cugraph/source/images/Nx_Cg_1.png diff --git a/docs/source/images/Nx_Cg_2.png b/docs/cugraph/source/images/Nx_Cg_2.png similarity index 100% rename from docs/source/images/Nx_Cg_2.png rename to docs/cugraph/source/images/Nx_Cg_2.png diff --git a/docs/source/index.rst b/docs/cugraph/source/index.rst similarity index 100% rename from docs/source/index.rst rename to docs/cugraph/source/index.rst diff --git a/docs/source/nx_transition.rst b/docs/cugraph/source/nx_transition.rst similarity index 100% rename from docs/source/nx_transition.rst rename to docs/cugraph/source/nx_transition.rst diff --git a/docs/source/sphinxext/github_link.py b/docs/cugraph/source/sphinxext/github_link.py similarity index 88% rename from docs/source/sphinxext/github_link.py rename to docs/cugraph/source/sphinxext/github_link.py index a7a46fdd9df..fa8fe3f5fe3 100644 --- a/docs/source/sphinxext/github_link.py +++ b/docs/cugraph/source/sphinxext/github_link.py @@ -1,3 +1,17 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: # This contains code with copyright by the scikit-learn project, subject to the # license in /thirdparty/LICENSES/LICENSE.scikit_learn diff --git a/docs/source/cugraph_intro.md b/docs/source/cugraph_intro.md deleted file mode 100644 index 5bf2b715462..00000000000 --- a/docs/source/cugraph_intro.md +++ /dev/null @@ -1,22 +0,0 @@ - -# cuGraph Introduction - - -## Terminology - -cuGraph is a collection of GPU accelerated graph algorithms and graph utility -functions. The application of graph analysis covers a lot of areas. -For Example: -* [Network Science](https://en.wikipedia.org/wiki/Network_science) -* [Complex Network](https://en.wikipedia.org/wiki/Complex_network) -* [Graph Theory](https://en.wikipedia.org/wiki/Graph_theory) -* [Social Network Analysis](https://en.wikipedia.org/wiki/Social_network_analysis) - -cuGraph does not favor one field over another. Our developers span the -breadth of fields with the focus being to produce the best graph library -possible. 
However, each field has its own argot (jargon) for describing the -graph (or network). In our documentation, we try to be consistent. In Python -documentation we will mostly use the terms __Node__ and __Edge__ to better -match NetworkX preferred term use, as well as other Python-based tools. At -the CUDA/C layer, we favor the mathematical terms of __Vertex__ and __Edge__. - diff --git a/docs/source/dask-cugraph.rst b/docs/source/dask-cugraph.rst deleted file mode 100644 index b27ad382809..00000000000 --- a/docs/source/dask-cugraph.rst +++ /dev/null @@ -1,70 +0,0 @@ -~~~~~~~~~~~~~~~~~~~~~~ -Multi-GPU with cuGraph -~~~~~~~~~~~~~~~~~~~~~~ - -cuGraph supports multi-GPU leveraging `Dask `_. Dask is a flexible library for parallel computing in Python which makes scaling out your workflow smooth and simple. cuGraph also uses other Dask-based RAPIDS projects such as `dask-cuda `_. The maximum graph size is currently limited to 2 Billion vertices (to be waived in the next versions). - -Distributed graph analytics -=========================== - -The current solution is able to scale across multiple GPUs on multiple machines. Distributing the graph and computation lets you analyze datasets far larger than a single GPU’s memory. - -With cuGraph and Dask, whether you’re using a single NVIDIA GPU or multiple nodes, your RAPIDS workflow will run smoothly, intelligently distributing the workload across the available resources. - -If your graph comfortably fits in memory on a single GPU, you would want to use the single-GPU version of cuGraph. If you want to distribute your workflow across multiple GPUs and have more data than you can fit in memory on a single GPU, you would want to use cuGraph's multi-GPU features. - - -Distributed Graph Algorithms ----------------------------- - -.. automodule:: cugraph.dask.link_analysis.pagerank - :members: pagerank - :undoc-members: - -.. automodule:: cugraph.dask.traversal.bfs - :members: bfs - :undoc-members: - - -Helper functions ----------------- - -.. automodule:: cugraph.comms.comms - :members: initialize - :undoc-members: - -.. automodule:: cugraph.comms.comms - :members: destroy - :undoc-members: - -.. automodule:: cugraph.dask.common.read_utils - :members: get_chunksize - :undoc-members: - -Consolidation -============= - -cuGraph can transparently interpret the Dask cuDF Dataframe as a regular Dataframe when loading the edge list. This is particularly helpful for workflows extracting a single GPU sized edge list from a distributed dataset. From there any existing single GPU feature will just work on this input. - -For instance, consolidation allows leveraging Dask cuDF CSV reader to load file(s) on multiple GPUs and consolidate this input to a single GPU graph. Reading is often the time and memory bottleneck, with this feature users can call the Multi-GPU version of the reader without changing anything else. - -Batch Processing -================ - -cuGraph can leverage multi GPUs to increase processing speed for graphs that fit on a single GPU, providing faster analytics on such graphs. -You will be able to use the Graph the same way as you used to in a Single GPU environment, but analytics that support batch processing will automatically use the GPUs available to the dask client. -For example, Betweenness Centrality scores can be slow to obtain depending on the number of vertices used in the approximation. 
Thank to Multi GPUs Batch Processing, -you can create Single GPU graph as you would regularly do it using cuDF CSV reader, enable Batch analytics on it, and obtain scores much faster as each GPU will handle a sub-set of the sources. -In order to use Batch Analytics you need to set up a Dask Cluster and Client in addition to the cuGraph communicator, then you can simply call `enable_batch()` on you graph, and algorithms supporting batch processing will use multiple GPUs. - -Algorithms supporting Batch Processing --------------------------------------- -.. automodule:: cugraph.centrality - :members: betweenness_centrality - :undoc-members: - :noindex: - -.. automodule:: cugraph.centrality - :members: edge_betweenness_centrality - :undoc-members: - :noindex: diff --git a/github/workflows/labeler.yml b/github/workflows/labeler.yml new file mode 100644 index 00000000000..23956a02fbd --- /dev/null +++ b/github/workflows/labeler.yml @@ -0,0 +1,11 @@ +name: "Pull Request Labeler" +on: +- pull_request_target + +jobs: + triage: + runs-on: ubuntu-latest + steps: + - uses: actions/labeler@main + with: + repo-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/notebooks/README.md b/notebooks/README.md index a5706720235..3769ceb6957 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -10,6 +10,7 @@ This repository contains a collection of Jupyter Notebooks that outline how to r | Folder | Notebook | Description | | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | | Centrality | | | +| | [Centrality](centrality/Centrality.ipynb) | Compute and compare multiple centrality scores | | | [Katz](centrality/Katz.ipynb) | Compute the Katz centrality for every vertex | | | [Betweenness](centrality/Betweenness.ipynb) | Compute both Edge and Vertex Betweenness centrality | | Community | | | @@ -33,6 +34,8 @@ This repository contains a collection of Jupyter Notebooks that outline how to r | Traversal | | | | | [BFS](traversal/BFS.ipynb) | Compute the Breadth First Search path from a starting vertex to every other vertex in a graph | | | [SSSP](traversal/SSSP.ipynb) | Single Source Shortest Path - compute the shortest path from a starting vertex to every other vertex | +| Sampling | +| | [Random Walk](sampling/RandomWalk.ipynb) | Compute Random Walk for a various number of seeds and path lengths | | Structure | | | | | [Renumbering](structure/Renumber.ipynb)
[Renumbering 2](structure/Renumber-2.ipynb) | Renumber the vertex IDs in a graph (two sample notebooks) | | | [Symmetrize](structure/Symmetrize.ipynb) | Symmetrize the edges in a graph | @@ -49,22 +52,21 @@ Running the example in these notebooks requires: * Download via Docker, Conda (See [__Getting Started__](https://rapids.ai/start.html)) * cuGraph is dependent on the latest version of cuDF. Please install all components of RAPIDS -* Python 3.6+ +* Python 3.7+ * A system with an NVIDIA GPU: Pascal architecture or better -* CUDA 9.2+ -* NVIDIA driver 396.44+ +* CUDA 11.0+ +* NVIDIA driver 450.51+ #### Notebook Credits - Original Authors: Bradley Rees -- Last Edit: 04/24/2020 +- Last Edit: 04/19/2021 -RAPIDS Versions: 0.14 +RAPIDS Versions: 0.19 Test Hardware - - GV100 32G, CUDA 9,2 diff --git a/notebooks/centrality/Centrality.ipynb b/notebooks/centrality/Centrality.ipynb new file mode 100644 index 00000000000..591c27419ba --- /dev/null +++ b/notebooks/centrality/Centrality.ipynb @@ -0,0 +1,443 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Centrality" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, we will compute vertex centrality scores using the various cuGraph algorithms. We will then compare the similarities and differences.\n", + "\n", + "| Author Credit | Date | Update | cuGraph Version | Test Hardware |\n", + "| --------------|------------|--------------|-----------------|----------------|\n", + "| Brad Rees | 04/16/2021 | created | 0.19 | GV100, CUDA 11.0\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Centrality is a measure of how important, or central, a node or edge is within a graph. It is useful for identifying influencers in social networks, key routing nodes in communication/computer network infrastructures, and more. \n", + "\n", + "The seminal paper on centrality is: Freeman, L. C. (1978). Centrality in social networks conceptual clarification. Social networks, 1(3), 215-239.\n", + "\n", + "\n", + "__Degree centrality – done but need new API__
\n", + "Degree centrality is based on the notion that whoever has the most connects must be important. \n", + "\n", + "
\n", + " Cd(v) = degree(v)\n", + "
\n", + "\n", + "cuGraph currently does not have a Degree Centrality function call. However, since Degree Centrality is just the degree of a node, we can use _G.degree()_ function.\n", + "Degree Centrality for a Directed graph can be further divided in _indegree centrality_ and _outdegree centrality_ and can be obtained using _G.degrees()_\n", + "\n", + "\n", + "__Closeness centrality – coming soon__
\n", + "Closeness is a measure of the shortest path to every other node in the graph. A node that is close to every other node, can reach over other node in the fewest number of hops, means that it has greater influence on the network versus a node that is not close.\n", + "\n", + "__Betweenness Centrality__
\n", + "Betweenness is a measure of the number of shortest paths that cross through a node, or over an edge. A node with high betweenness means that it had a greater influence on the flow of information. \n", + "\n", + "Betweenness centrality of a node 𝑣 is the sum of the fraction of all-pairs shortest paths that pass through 𝑣\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "To speedup runtime of betweenness centrailty, the metric can be computed on a limited number of nodes (randomly selected) and then used to estimate the other scores. For this example, the graphs are relatively smalled (under 5,000 nodes) so betweenness on every node will be computed.\n", + "\n", + "__Eigenvector Centrality - coming soon__
\n", + "Eigenvectors can be thought of as the balancing points of a graph, or center of gravity of a 3D object. High centrality means that more of the graph is balanced around that node.\n", + "\n", + "__Katz Centrality__
\n", + "Katz is a variant of degree centrality and of eigenvector centrality. \n", + "Katz centrality is a measure of the relative importance of a node within the graph based on measuring the influence across the total number of walks between vertex pairs. \n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "See:\n", + "* [Katz on Wikipedia](https://en.wikipedia.org/wiki/Katz_centrality) for more details on the algorithm.\n", + "* https://www.sci.unich.it/~francesc/teaching/network/katz.html\n", + "\n", + "__PageRank Centrality__
\n", + "PageRank is classified as both a Link Analysis tool and a centrality measure. PageRank is based on the assumption that important nodes point (directed edge) to other important nodes. From a social network perspective, the question is who do you seek for an answer and then who does that person seek. PageRank is good when there is implied importance in the data, for example a citation network, web page linkages, or trust networks. \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test Data\n", + "We will be using the Zachary Karate club dataset \n", + "*W. W. Zachary, An information flow model for conflict and fission in small groups, Journal of\n", + "Anthropological Research 33, 452-473 (1977).*\n", + "\n", + "\n", + "![Karate Club](../img/zachary_black_lines.png)\n", + "\n", + "\n", + "Because the test data has vertex IDs starting at 1, the auto-renumber feature of cuGraph (mentioned above) will be used so the starting vertex ID is zero for maximum efficiency. The resulting data will then be auto-unrenumbered, making the entire renumbering process transparent to users." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the modules\n", + "import cugraph\n", + "import cudf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd \n", + "from IPython.display import display_html " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Functions\n", + "using underscore variable names to avoid collisions. \n", + "non-underscore names are expected to be global names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute Centrality\n", + "# the centrality calls are very straight forward with the graph being the primary argument\n", + "# we are using the default argument values for all centrality functions\n", + "\n", + "def compute_centrality(_graph) :\n", + " # Compute Degree Centrality\n", + " _d = _graph.degree()\n", + " \n", + " # Compute the Betweenness Centrality\n", + " _b = cugraph.betweenness_centrality(_graph)\n", + "\n", + " # Compute Katz Centrality\n", + " _k = cugraph.katz_centrality(_graph)\n", + " \n", + " # Compute PageRank Centrality\n", + " _p = cugraph.pagerank(_graph)\n", + " \n", + " return _d, _b, _k, _p" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Print function\n", + "# being lazy and requiring that the dataframe names are not changed versus passing them in\n", + "def print_centrality(_n):\n", + " dc_top = dc.sort_values(by='degree', ascending=False).head(_n).to_pandas()\n", + " bc_top = bc.sort_values(by='betweenness_centrality', ascending=False).head(_n).to_pandas()\n", + " katz_top = katz.sort_values(by='katz_centrality', ascending=False).head(_n).to_pandas()\n", + " pr_top = pr.sort_values(by='pagerank', ascending=False).head(_n).to_pandas()\n", + " \n", + " df1_styler = dc_top.style.set_table_attributes(\"style='display:inline'\").set_caption('Degree').hide_index()\n", + " df2_styler = bc_top.style.set_table_attributes(\"style='display:inline'\").set_caption('Betweenness').hide_index()\n", + " df3_styler = katz_top.style.set_table_attributes(\"style='display:inline'\").set_caption('Katz').hide_index()\n", + " df4_styler = 
pr_top.style.set_table_attributes(\"style='display:inline'\").set_caption('PageRank').hide_index()\n", + "\n", + "    display_html(df1_styler._repr_html_()+df2_styler._repr_html_()+df3_styler._repr_html_()+df4_styler._repr_html_(), raw=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the path to the test data \n", + "datafile='../data/karate-data.csv'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "cuGraph does not do any data reading or writing and is dependent on other tools for that, with cuDF being the preferred solution. \n", + "\n", + "The data file contains an edge list, which represents the connection of a vertex to another. The `source` to `destination` pairs are in what is known as Coordinate Format (COO). In this test case, the data is just two columns. However, a third `weight` column is also possible." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gdf = cudf.read_csv(datafile, delimiter='\\t', names=['src', 'dst'], dtype=['int32', 'int32'] )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It was that easy to load the data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a Graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a Graph using the source (src) and destination (dst) vertex pairs from the Dataframe \n", + "G = cugraph.Graph()\n", + "G.from_cudf_edgelist(gdf, source='src', destination='dst')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute Centrality" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dc, bc, katz, pr = compute_centrality(G)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Results\n", + "Typically, analysts look at just the top 10% of results, basically just those vertices that are the most central or important. \n", + "The karate data has 34 vertices, so let's round a little and look at the top 5 vertices" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print_centrality(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### A Different Dataset\n", + "The Karate dataset is not that large or complex, which makes it a perfect test dataset since it is easy to visually verify results. 
Let's look at a larger dataset with a lot more edges" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the path to the test data \n", + "datafile='../data/netscience.csv'\n", + "\n", + "gdf = cudf.read_csv(datafile, delimiter=' ', names=['src', 'dst', 'wt'], dtype=['int32', 'int32', 'float'] )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a Graph using the source (src) and destination (dst) vertex pairs from the Dataframe \n", + "G = cugraph.Graph()\n", + "G.from_cudf_edgelist(gdf, source='src', destination='dst')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(G.number_of_nodes(), G.number_of_edges())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dc, bc, katz, pr = compute_centrality(G)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print_centrality(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now see a larger discrepancy between the centrality scores and which nodes rank highest.\n", + "Which centrality measure to use is left to the analyst to decide and does require insight into the different algorithms and graph structure." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### And One More Dataset\n", + "Let's look at a Cyber dataset. The vertex IDs are IP addresses" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the path to the test data \n", + "datafile='../data/cyber.csv'\n", + "\n", + "gdf = cudf.read_csv(datafile, delimiter=',', names=['idx', 'src', 'dst'], dtype=['int32', 'str', 'str'] )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a Graph using the source (src) and destination (dst) vertex pairs from the Dataframe \n", + "G = cugraph.Graph()\n", + "G.from_cudf_edgelist(gdf, source='src', destination='dst')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(G.number_of_nodes(), G.number_of_edges())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dc, bc, katz, pr = compute_centrality(G)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print_centrality(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are differences in how each centrality measure ranks the nodes. In some cases, every algorithm returns similar results, and in others, the results are different. Understanding how the centrality measure is computed and what the edges represent is key to selecting the right centrality metric." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "Copyright (c) 2019-2021, NVIDIA CORPORATION.\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. 
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "----\n",
+    "Copyright (c) 2019-2021, NVIDIA CORPORATION.\n",
+    "\n",
+    "Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n",
+    "\n",
+    "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "cugraph_dev",
+   "language": "python",
+   "name": "cugraph_dev"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/community/ECG.ipynb b/notebooks/community/ECG.ipynb
index d7595dadb26..4a9eedd3c3a 100644
--- a/notebooks/community/ECG.ipynb
+++ b/notebooks/community/ECG.ipynb
@@ -5,6 +5,7 @@
  "metadata": {},
  "source": [
   "# Ensemble Clustering for Graphs (ECG)\n",
+  "# Does not run on Pascal\n",
   "In this notebook, we will use cuGraph to identify the clusters in a test graph using the Ensemble Clustering for Graphs approach. \n",
   "\n",
   "\n",
diff --git a/notebooks/community/Louvain.ipynb b/notebooks/community/Louvain.ipynb
index e5e5e6a04ed..bfb8e299f49 100755
--- a/notebooks/community/Louvain.ipynb
+++ b/notebooks/community/Louvain.ipynb
@@ -5,6 +5,7 @@
  "metadata": {},
  "source": [
   "# Louvain Community Detection\n",
+  "# Does not run on Pascal\n",
   "\n",
   "\n",
   "In this notebook, we will use cuGraph to identify the clusters in a test graph using the Louvain algorithm \n",
diff --git a/notebooks/community/Subgraph-Extraction.ipynb b/notebooks/community/Subgraph-Extraction.ipynb
index e068ef53aa5..cac52262d4d 100755
--- a/notebooks/community/Subgraph-Extraction.ipynb
+++ b/notebooks/community/Subgraph-Extraction.ipynb
@@ -5,6 +5,7 @@
  "metadata": {},
  "source": [
   "# Subgraph Extraction\n",
+  "# Does not run on Pascal\n",
   "\n",
   "In this notebook, we will use cuGraph to extract a subgraph from the test graph. \n",
   "\n",
diff --git a/notebooks/community/Triangle-Counting.ipynb b/notebooks/community/Triangle-Counting.ipynb
index 09d7906a526..19d3f838fc6 100755
--- a/notebooks/community/Triangle-Counting.ipynb
+++ b/notebooks/community/Triangle-Counting.ipynb
@@ -21,7 +21,7 @@
  "\n",
  "\n",
  "## Introduction\n",
- "Triancle Counting, as the name implies, finds the number of triangles in a graph. Triangles are important in computing the clustering Coefficient and can be used for clustering. \n",
+ "Triangle Counting, as the name implies, finds the number of triangles in a graph. Triangles are important in computing the clustering Coefficient and can be used for clustering. \n",
  "\n",
  "\n",
  "To compute the triangle count for a graph in cuGraph we use:\n",
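A minimal sketch of the triangle-counting call described above, assuming the pre-22.x API in which `cugraph.triangles` returns the total triangle count as a single integer:

```python
import cudf
import cugraph

# Build an undirected graph from the tab-separated karate edge list.
gdf = cudf.read_csv('../data/karate-data.csv', delimiter='\t',
                    names=['src', 'dst'], dtype=['int32', 'int32'])
G = cugraph.Graph()
G.from_cudf_edgelist(gdf, source='src', destination='dst')

# Total number of triangles in the graph (returned as an integer).
print(cugraph.triangles(G))
```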
\n", diff --git a/notebooks/cores/ktruss.ipynb b/notebooks/cores/ktruss.ipynb index e6470110666..3f283558f27 100644 --- a/notebooks/cores/ktruss.ipynb +++ b/notebooks/cores/ktruss.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# K-Truss\n", + "# Does not run on CUDA 11.4\n", "\n", "\n", "In this notebook, we will use cuGraph to identify the K-Truss clusters in a test graph \n", @@ -12,18 +13,13 @@ "Notebook Credits\n", "* Original Authors: Bradley Rees\n", "* Created: 10/28/2019\n", - "* Last Edit: 08/16/2020\n", - "\n", - "RAPIDS Versions: 0.13\n", - "\n", - "Test Hardware\n", - "* GV100 32G, CUDA 10.2\n", - "\n", + "* Last Edit: 08/13/2021\n", "\n", "\n", "## Introduction\n", "\n", "Compute the k-truss of the graph G. A K-Truss is a relaxed cliques where every vertex is supported by at least k-2 triangle.\n", + "NOTE: k-truss is currently not supported on CUDA 11.4 systems.\n", "\n", "Ref:\n", "\n", diff --git a/notebooks/cugraph_benchmarks/bfs_benchmark.ipynb b/notebooks/cugraph_benchmarks/bfs_benchmark.ipynb index 58eb94bf0ee..6ae695e206e 100644 --- a/notebooks/cugraph_benchmarks/bfs_benchmark.ipynb +++ b/notebooks/cugraph_benchmarks/bfs_benchmark.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# BFS Performance Benchmarking\n", + "# Skip notebook test\n", "\n", "This notebook benchmarks performance of running BFS within cuGraph against NetworkX. \n", "\n", diff --git a/notebooks/cugraph_benchmarks/louvain_benchmark.ipynb b/notebooks/cugraph_benchmarks/louvain_benchmark.ipynb index a12b7c4bcc2..00e99a28617 100644 --- a/notebooks/cugraph_benchmarks/louvain_benchmark.ipynb +++ b/notebooks/cugraph_benchmarks/louvain_benchmark.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# Louvain Performance Benchmarking\n", + "# Skip notebook test\n", "\n", "This notebook benchmarks performance improvement of running the Louvain clustering algorithm within cuGraph against NetworkX. The test is run over eight test networks (graphs) and then results plotted. \n", "

\n", diff --git a/notebooks/cugraph_benchmarks/nx_cugraph_bc_benchmarking.ipynb b/notebooks/cugraph_benchmarks/nx_cugraph_bc_benchmarking.ipynb index 6f76868f9a4..403c317ac0a 100644 --- a/notebooks/cugraph_benchmarks/nx_cugraph_bc_benchmarking.ipynb +++ b/notebooks/cugraph_benchmarks/nx_cugraph_bc_benchmarking.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# Benchmarking NetworkX compatibility\n", + "# Skip notebook test\n", "This notebook benchmark the use of a NetworkX Graph object as input into algorithms.

\n", "The intention of the feature is to be able to drop cuGraph into existing NetworkX code in spot where performance is not optimal.\n", "\n", diff --git a/notebooks/cugraph_benchmarks/pagerank_benchmark.ipynb b/notebooks/cugraph_benchmarks/pagerank_benchmark.ipynb index c2933a10c7d..d0416efdd87 100644 --- a/notebooks/cugraph_benchmarks/pagerank_benchmark.ipynb +++ b/notebooks/cugraph_benchmarks/pagerank_benchmark.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# PageRank Performance Benchmarking\n", + "# Skip notebook test\n", "\n", "This notebook benchmarks performance of running PageRank within cuGraph against NetworkX. NetworkX contains several implementations of PageRank. This benchmark will compare cuGraph versus the defaukt Nx implementation as well as the SciPy version\n", "\n", diff --git a/notebooks/cugraph_benchmarks/random_walk_benchmark.ipynb b/notebooks/cugraph_benchmarks/random_walk_benchmark.ipynb new file mode 100644 index 00000000000..65cf9fb59eb --- /dev/null +++ b/notebooks/cugraph_benchmarks/random_walk_benchmark.ipynb @@ -0,0 +1,545 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Random Walk Performance\n", + "# Skip notebook testΒΆ \n", + "\n", + "Random walk performance is governed by the length of the paths to find, the number of seeds, and the size or structure of the graph.\n", + "This benchmark will use several test graphs of increasingly larger sizes. While not even multiples in scale, the four test graphs should give an indication of how well Random Walk performs as data size increases. \n", + "\n", + "### Test Data\n", + "Users must run the _dataPrep.sh_ script before running this notebook so that the test files are downloaded\n", + "\n", + "| File Name | Num of Vertices | Num of Edges |\n", + "| ---------------------- | --------------: | -----------: |\n", + "| preferentialAttachment | 100,000 | 999,970 |\n", + "| dblp-2010 | 326,186 | 1,615,400 |\n", + "| coPapersCiteseer | 434,102 | 32,073,440 |\n", + "| as-Skitter | 1,696,415 | 22,190,596 |" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the modules\n", + "import cugraph\n", + "import cudf" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# system and other\n", + "import gc\n", + "import os\n", + "import time\n", + "import random\n", + "\n", + "# MTX file reader\n", + "from scipy.io import mmread" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "try: \n", + " import matplotlib\n", + "except ModuleNotFoundError:\n", + " os.system('pip install matplotlib')\n", + "\n", + "import matplotlib.pyplot as plt; plt.rcdefaults()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Test File\n", + "data = {\n", + " 'preferentialAttachment' : './data/preferentialAttachment.mtx',\n", + " 'dblp' : './data/dblp-2010.mtx',\n", + " 'coPapersCiteseer' : './data/coPapersCiteseer.mtx',\n", + " 'as-Skitter' : './data/as-Skitter.mtx'\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read the data and create a graph" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Data reader - the file format is MTX, so we will use the reader from SciPy\n", + "def read_and_create(datafile):\n", + " print('Reading ' + str(datafile) + 
'...')\n", + " M = mmread(datafile).asfptype()\n", + "\n", + " _gdf = cudf.DataFrame()\n", + " _gdf['src'] = M.row\n", + " _gdf['dst'] = M.col\n", + " _gdf['wt'] = 1.0\n", + " \n", + " _g = cugraph.Graph()\n", + " _g.from_cudf_edgelist(_gdf, source='src', destination='dst', edge_attr='wt', renumber=False)\n", + " \n", + " print(\"\\t{:,} nodes, {:,} edges\".format(_g.number_of_nodes(), _g.number_of_edges() ))\n", + " \n", + " return _g" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define the call to RandomWalk\n", + "We are only interested in the runtime, so throw away the results" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def run_rw(_G, _seeds, _depth):\n", + " t1 = time.time()\n", + " # cugraph.random_walks() returns a 3-tuple, which is being ignored here.\n", + " cugraph.random_walks(_G, _seeds, _depth)\n", + " t2 = time.time() - t1\n", + " return t2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 1: Runtime versus path depth" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading ./data/preferentialAttachment.mtx...\n", + "\t100,000 nodes, 499,985 edges\n", + "update i\n", + "Reading ./data/dblp-2010.mtx...\n", + "\t326,183 nodes, 807,700 edges\n", + "update i\n", + "Reading ./data/coPapersCiteseer.mtx...\n", + "\t434,102 nodes, 16,036,720 edges\n", + "update i\n", + "Reading ./data/as-Skitter.mtx...\n", + "\t1,696,415 nodes, 11,095,298 edges\n", + "update i\n" + ] + } + ], + "source": [ + "# some parameters\n", + "max_depth = 6\n", + "num_seeds = 500\n", + "\n", + "# arrays to capture performance gains\n", + "names = []\n", + "\n", + "# Two dimension data\n", + "time_algo_cu = [] # will be two dimensional\n", + "\n", + "i = 0\n", + "for k,v in data.items():\n", + " time_algo_cu.append([])\n", + " \n", + " # Saved the file Name\n", + " names.append(k)\n", + "\n", + " # read data\n", + " G = read_and_create(v)\n", + " \n", + " num_nodes = G.number_of_nodes()\n", + " nodes = G.nodes().to_array().tolist()\n", + "\n", + " seeds = random.sample(nodes, num_seeds)\n", + "\n", + " for j in range (2, max_depth+1) :\n", + " t = run_rw(G, seeds, j)\n", + " time_algo_cu[i].append(t)\n", + "\n", + " # update i\n", + " i = i + 1\n", + " print(\"update i\")\n", + " \n", + " del G\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "list" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(nodes)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAmEAAAFNCAYAAABIc7ibAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAACCeUlEQVR4nOzdd3hUZfbA8e876b2HHkINIXRCb6F3kd6Lil1XdF3brmtZXd11f7trXZcmCZCELlVBUKQLAQFJoQdCDek9mfL+/piQDRAENJMAns/z8JiZ+957z0xG5vC+556rtNYIIYQQQoiqZajuAIQQQgghfoskCRNCCCGEqAaShAkhhBBCVANJwoQQQgghqoEkYUIIIYQQ1UCSMCGEEEKIaiBJmBD3EaVUkFIqTyllV92x2JpSaoZSakd1x3EnlFLBSimtlLK/g30GKqW+tGFY1ep23xOl1HCl1JKqikuIqiBJmBA2ppRKVkoVliZHl5RSC5RS7pV47H5XH2utz2qt3bXW5so4fmVTSkUopSyl70WuUuqoUuqh29jvjpOXCo5xzXtVFSrpnO8C7193zKufpzyl1Kbrzvl86ecsRyk1XynlVG5bsFLqO6VUgVIq6ediU0rVVUqtUEqlKaWylVJHlFIzfuVr+cW01muBMKVUq+qKQYjKJkmYEFVjuNbaHWgDtAVerd5wqtWF0vfCE3gZmKOUal7NMd2VlFIdAC+t9Z7rNg0vTbbdtdYDyo0fCLwC9AXqAw2Bt8rtFwP8CPgBfwSWK6UCbnL6hUBK6XH8gKnA5V//qn6VGOCxao5BiEojSZgQVUhrfQnYiDUZuzozdK78mPKzJ0qpN5VSS5VSUaUzR/FKqfDSbQuBIGBt6YzIS9fPGCmltiql3lFK7Sods1Yp5aeUWlw6U7JPKRVc7tzNlFLfKKUySmepxlX0OpRS45VScdc997xSak3pz0OUUgmlMZ9XSr1YwXuhtdZfAplAc6XUUKXUj6VxpSil3iw3fFvpf7NKX0eXcuf9h1IqUyl1Wik1+Fa/gwpei0Ep9YpS6qRSKr30/fYt3Xb1/ZyulDpbOiv0x3L7uiilIkvPn1j6OzhXuu2G30+5006u6HgVGAx8fwcvZzowT2sdr7XOBP4CzCiNpynQDnhDa12otV4B/ASMvsmxOgALtNb5WmuT1vpHrfVX5V5759LPVZZS6pBSKqLcNi+l1Dyl1MXS3/87qnSJXCllV/o7S1NKnQKGlj+psi4znyr97JxWSk0ut3nr9eOFuJdJEiZEFVJK1cX6xXriDnZ7AIgFvIE1wCcAWuupwFn+Nyvy95vsPwHrLEYdoBGwG/gC8AUSgTdKY3MDvgGigcDS/T67ySzVWiBEKdWk3HOTSvcFmAc8rrX2AFoA315/gNLkZ2Tp6/oJyAemlT4eCjyplHqwdHjP0v96l77W3aWPOwFHAX/g78A8pZS6yftwM88CDwK9gNpYk8JPrxvTHQjBOsP0Z6VUaOnzbwDBWGec+gNTru5wi9/PzY53vZalr+96i5VSV5RSm5RSrcs9HwYcKvf4EFBDKeVXuu2U1jr3uu1hNzn3HuBTpdQEpVRQ+Q1KqTrAeuAdrJ+jF4EV5WbVFgAmoDHWmd8BwMzSbY8Cw0qfDwfGlDuuG/ARMLj0s9MVOFju1IlAsFLK8yYxC3FPkSRMiKrxpVIqF+vyTiqlic9t2qG13lBa57UQaH2rHa7zhdb6pNY6G/gKOKm13qy1NgHLsH4ZgvWLMVlr/cXVmQ9gBTD2+gNqrQuA1cBEgNJkrBnWJBHAiHV2y1Nrnam1PlBu99pKqSwgDev7MFVrfVRrvVVr/ZPW2qK1Pox16anXLV7bGa31nNL3JhKoBdS4o3cHngD+qLU+p7UuBt4Exqhr68/eKp09OoQ1cbn6OxgH/LX0NZ7DmkDcjpsd73reQO51z03GmvjVB74DNiqlvEu3uQPZ5cZe/dmjgm1Xt3vc5Nxjge3A68BppdRBZV0eBWuyuaH0c2nRWn8DxAFDlFI1gCHArNJZtFTgX1iTerC+Z//WWqdorTOA9647rwVooZRy0Vpf1FrHl9t29b3wRoj7gCRhQlSNB0v/ZR+BNVnxv4N9L5X7uQBwVndWoF6+jqewgsdXLxKoD3QqXV7KKk2UJgM1b3LcaEqTMKyzYF+WJmdgXeIaApxRSn1ffvkQa02Yt9baV2vdRmsdC6CU6qSsReNXlFLZWJOjW71PZe9NuXPf6UUP9YFV5V5zImDm2mTu+t/B1XPUxppYX1X+559zs+NdL5PrkiSt9c7SBK5Aa/0ekAX0KN2ch7XW7qqrP+dWsO3q9uuTvKvnydRav6K1DsP6XhzE+o8JhfU9G3vdZ6U71iS4PuAAXCy37b9YZ1fhxvfsTLlz5gPjsf7uLyql1iulmpUbe/W9yKooZiHuNZKECVGFtNbfY12q+UfpU/mA69XtpXUzNyuUrvCQlRac9Yvx+9IE6eofd631kzcZ/w0QoJRqgzUZu7oUidZ6n9Z6BNYv3i+Bpbdx/misM2n1tNZewOfA1aXFynyd10vBuvxV/nU7a63P38a+F4G65R7Xu277r437MND0FmM0/3uf4rl2Vq01cFlrnV66raFSyuO67eVnmio+gdZpWD+ztbEuP6YAC697z9y01u+XbisG/Mtt8yxN5sD6npV/n65Z6tRab9Ra98ea0CUBc8ptDsU6W5tzq5iFuBdIEiZE1fs30L+0lucY1pmtoUopB+BPgNPP7Xydy1jrkSrDOqCpUmqqUsqh9E+Hm9Uraa2NWJczP8D6xfwNgFLKUSk1WSnlVTomB+sS0614ABla6yKlVEess2tXXSk9xq99rQ5KKedyf+yxJnvvKqXql8YfoJQacZvHWwq8qpTyKa2Teua67b/297OBckuyytoHrlvpe+yslPoD1tnCnaVDooBHlFLNS5co/4Q16UdrfQzrbNYbpfuOBFphXXK+gVLqb0qpFkop+9LE7UngRGlCtwgYrqw9zOxKjxehlKqrtb4IbAL+TynlWVr710gpdfV1LAV+p6wtMHywXs159Zw1lFIjSmvDirHO3pX/7PTCuqQuxH1BkjAhqpjW+grWL8s/l9ZpPQXMBc5jnRk79zO7X+894E+lyz43XIF4h3HlYi2gngBcwLpk9jd+PimMBvoBy0przK6aCiQrpXKwLi1Nrmjn6zwFvF1aO/dnys2elS41vgvsLH2tnW/7hV1rA9Yl2Kt/3gQ+xDoDt6n03HuwFvzfjrex/r5OA5uB5ViTh6t+1e+ntJYuWyl1NR4P4D9YlynPA4OwzuKll47/GusFCt9hvSjgDNfWH07AWgyfibX32JjSz2NFXIFVWJf+TmFdZnyg9DwpwAjgNawJcgrwB/73nTINcAQSSs+1HOvMFlhntjZirYU7AKwsd04D8ALWz18G1qSr/EzsRKxLm0LcF5TWtpzlF0KI3w6l1JPABK31rS4ouJNjDgCe0lo/WFnHvBcppYZjvYijwrYpQtyLJAkTQohfSClVC+ty426gCda2DZ9orf9dnXEJIe4Nv/gWIEIIIX
DEujzWAOuyXSzwWXUGJIS4d8hMmBBCCCFENZDCfCGEEEKIaiBJmBBCCCFENbjnasL8/f11cHBwdYchhBBCCHFL+/fvT9NaV9iE+55LwoKDg4mLi6vuMIQQQgghbkkpdeZm22Q5UgghhBCiGkgSJoQQQghRDSQJE0IIIYSoBvdcTVhFjEYj586do6ioqLpDEfchZ2dn6tati4ODQ3WHIoQQ4j5yXyRh586dw8PDg+DgYJRS1R2OuI9orUlPT+fcuXM0aNCgusMRQghxH7kvliOLiorw8/OTBExUOqUUfn5+MssqhBCi0t0XSRggCZiwGflsCSGEsIX7Jgm7123fvp2wsDDatGlDYWGhzc6zYMECLly4UPZ45syZJCQk/Ow+ERER1/RmO3jwIEopvv7667LnkpOTiY6OvmbMhg0bfnGcwcHBpKWl/eL979SvjVcIIYS4U5KEVSGz2XzTbYsXL+bVV1/l4MGDuLi43PJYWmssFssdx3B9EjZ37lyaN29+R8eIiYmhe/fuxMTElD1X2UlYVbvX4hVCCHHvkySskiQnJ9OsWTMmT55MaGgoY8aMoaCggODgYF5++WXatWvHsmXL2LRpE126dKFdu3aMHTuWvLw85s6dy9KlS3n99deZPHkyAB988AEdOnSgVatWvPHGG2XnCAkJYdq0abRo0YKUlJSbjgsNDeXRRx8lLCyMAQMGUFhYyPLly4mLi2Py5MllM27lZ7mefPJJwsPDCQsLKzvW9bTWLFu2jAULFvDNN9+U1Uq98sorbN++nTZt2vC3v/2NP//5zyxZsoQ2bdqwZMkS9u7dS5cuXWjbti1du3bl6NGjgDUxffHFF2nRogWtWrXi448/LjvXxx9/TLt27WjZsiVJSUkAvPnmm0yfPp0ePXpQv359Vq5cyUsvvUTLli0ZNGgQRqMRgP3799OrVy/at2/PwIEDuXjxImCd1Xv55Zfp2LEjTZs2Zfv27ZSUlNwQrxBCiPuX2WJm85nNHLpyqHoD0VrfU3/at2+vr5eQkHDDc1Xt9OnTGtA7duzQWmv90EMP6Q8++EDXr19f/+1vf9Naa33lyhXdo0cPnZeXp7XW+v3339dvvfWW1lrr6dOn62XLlmmttd64caN+9NFHtcVi0WazWQ8dOlR///33+vTp01oppXfv3n3LcXZ2dvrHH3/UWms9duxYvXDhQq211r169dL79u0ri7v84/T0dK211iaTSffq1UsfOnTohjE7duzQffr00VprPXHiRL18+XKttdbfffedHjp0aNlxv/jiC/3000+XPc7OztZGo1FrrfU333yjR40apbXW+rPPPtOjR48u23Y1hvr16+uPPvpIa631p59+qh955BGttdZvvPGG7tatmy4pKdEHDx7ULi4uesOGDVprrR988EG9atUqXVJSort06aJTU1O11lrHxsbqhx56qOy1vPDCC1prrdevX6/79u1bYbzXuxs+Y0IIIX6dvJI8vTB+oR64fKBusaCFfm37azY/JxCnb5LT3BctKsp7a208CRdyKvWYzWt78sbwsFuOq1evHt26dQNgypQpfPTRRwCMHz8egD179pCQkFA2pqSkhC5dutxwnE2bNrFp0ybatm0LQF5eHsePHycoKIj69evTuXPnW45r0KABbdq0AaB9+/YkJyffMv6lS5cye/ZsTCYTFy9eJCEhgVatWl0zJiYmhgkTJgAwYcIEoqKiGD169C2PnZ2dzfTp0zl+/DhKqbIZq82bN/PEE09gb2/9KPr6+pbtM2rUqLL4V65cWfb84MGDcXBwoGXLlpjNZgYNGgRAy5YtSU5O5ujRoxw5coT+/fsD1tm2WrVqVXjc23lfhBBC3Nsu5F0gOjGaFcdXkGfMo11gO14Mf5He9XpXa1z3XRJWna6/iu7qYzc3N8A669i/f/9raqkqorXm1Vdf5fHHH7/m+eTk5LJj3Wqck5NT2WM7O7tbFvufPn2af/zjH+zbtw8fHx9mzJhxQ1sGs9nMihUrWL16Ne+++25ZD63c3NyfPTbA66+/Tu/evVm1ahXJyclERETccp+rr8HOzg6TyXTD8waDAQcHh7L32WAwYDKZ0FoTFhbG7t277+i4Qggh7i+HrxwmKiGKzWc2AzCg/gCmNp9Ky4CWaJMJXVQMrq7VFt99l4TdzoyVrZw9e5bdu3fTpUsXoqOj6d69Oz/++GPZ9s6dO/P0009z4sQJGjduTH5+PufPn6dp06bXHGfgwIFl9WHu7u6cP3++wm7ttzuuPA8PjwqTppycHNzc3PDy8uLy5ct89dVXNyRKW7ZsoVWrVmzcuLHsuenTp7Nq1SrCwsKuOe7158nOzqZOnTqA9eKAq/r3789///tfevfujb29PRkZGdfMhv0SISEhXLlypex3YTQaOXbsGGFhN/9s3Ox9EUIIcW8xWUxsObuFhQkLOXTlEB4OHkxrPo1JoZOo6VYTS1ERGdHRZMybj+eQwQT+/vfVFqvNCvOVUvOVUqlKqSM32d5MKbVbKVWslHrRVnFUpZCQED799FNCQ0PJzMzkySefvGZ7QEAACxYsYOLEibRq1YouXbqUFZyXN2DAACZNmkSXLl1o2bIlY8aMqTBBuN1x5c2YMYMnnnjihlYYrVu3pm3btjRr1oxJkyaVLZmWFxMTw8iRI695bvTo0cTExNCqVSvs7Oxo3bo1//rXv+jduzcJCQllhe4vvfQSr776Km3btr1m9mnmzJkEBQXRqlUrWrdufc0Vlr+Uo6Mjy5cv5+WXX6Z169a0adOGXbt2/ew+18crhBDi3pJXkkdUfBRDVw7lxe9fJKMog1c6vsLmsZt5IfwFAixupM2ew4l+/bn89l+w9/fHtUOHao1ZWWvGbHBgpXoCeUCU1rpFBdsDgfrAg0Cm1voft3Pc8PBwXb5nFUBiYiKhoaG/OuZfIzk5mWHDhnHkSIU5p7jH3Q2fMSGEEDc6n3eexYmLWXl8JfnGfNoFtmNa2DQi6kZgZ7DDlJZGRmQUmTExWPLycOveHb/HHsW1Q4cqacatlNqvtQ6vaJvNliO11tuUUsE/sz0VSFVKDbVVDEIIIYS4Px1MPUhUQhRbzm7BgIEBwQOY1nwaYf7W0pOSc+e4Mn8+WStWoktK8Bg4EL9HZ+LyM6UpVe2+qwmrLsHBwTILJoQQQtiQyWJi89nNLIxfyOG0w3g4ejAjbAYTm02kpltNAIqOHSN9zlxyNmwAgwHvB0fg98gjOAYHV2/wFbgnkjCl1GPAYwBBQUHVHI0QQgghqlJuSS4rj69kceJiLuZfJMgjiNc6vcaIRiNwdbBe3Vhw4EfSZ88mb+tWlKsrvtOm4TtjOg41alRz9Dd3TyRhWuvZwGyw1oRVczhCCCGEqAIpuSlEJ0az8vhKCkwFdKjZgVc7vkqver0wKANaa/K2bSN99hwK4uKw8/bG/9ln8J08GTtv7+oO/5buiSRMCCGEEL8NWmsOXjlIVHwU36Z8iwEDgxoMYmrzqTT3s97rWJvN5GzcQNqcuRQnJmJfsyY1XnsV7zFjMFRj3687ZbMkTCkVA0QA/kqpc
8AbgAOA1vpzpVRNIA7wBCxKqVlAc6115ba7F0IIIcRdz2gx8k3yNyxMWMiR9CN4OnrycIuHmRAygRpu1iVFS0kJ2au+JH3ePIxnz+LYsCG1/vpXvIYNRTk6VvMruHO2vDpy4i22XwLq2ur81e3NN9/E3d2ddevW8Y9//IPw8GuvTl2wYAFxcXF88skn1RShEEIIUf1ySnJYcWwFixMXc7ngMsGewfyp058Y3mh4Wb2XOS+frCVLyFiwANOVKzi3aEHgRx/i0a8fymCzlqc2J8uRQgghhKhyKTkpLEpcxKoTqyg0FdKxZkde7/w6Per2wKCsiZUpI4OMhQvJXByNJScH1y6dqf2393Ht0qVKenzZmiRhlejdd98lMjKSwMBA6tWrR/v27QFYuHAhM2fOxGQyMX/+fDp27HjNfjNmzMDZ2Zm4uDhycnL45z//ybBhw6rjJQghhBA2o7XmQOoBouKj+C7lO+wMdgxpMISpzafSzLdZ2TjjhQukf7GArGXL0EVFePTvh9+jj+LSqlU1Rl/5JAmrJPv37yc2NpaDBw9iMplo165dWRJWUFDAwYMH2bZtGw8//HCF/cSSk5PZu3cvJ0+epHfv3pw4cQJnZ+eqfhlCCCFEpTNajGxK3kRUQhQJ6Ql4OXkxs+VMJjabSIBrQNm44pMnSZ8zl+x16wDwGj4cv5mP4NSoUXWFblP3XxL21Stw6afKPWbNljD4/Z8dsn37dkaOHIlr6VUZDzzwQNm2iROt5XE9e/YkJyeHrKysG/YfN24cBoOBJk2a0LBhQ5KSkmjTpk2lvQQhhBCiqmUXZ7P82HKik6JJLUgl2DOY1zu/zvBGw3GxdykbV3j4MOlz5pC7eQvKyQmfiRPxe2gGDrVrV2P0tnf/JWF3oevXrStax76dMUIIIcS94EzOGRYlLGL1ydUUmgrpVKsTb3R5g+51upfVe2mtKdi9m7TZcyjYsweDpyf+Tz6Bz9Sp2Pv4VPMrqBr3XxJ2ixkrW+nZsyczZszg1VdfxWQysXbtWh5//HEAlixZQu/evdmxYwdeXl54eXndsP+yZcuYPn06p0+f5tSpU4SEhFT1SxBCCCF+Ma01cZfjiEqI4vuU77E32JfVe4X4/u87TVss5H6zmfQ5cyg6cgT7gAACX3oJ73HjsHN3q8ZXUPXuvySsmrRr147x48fTunVrAgMD6dChQ9k2Z2dn2rZti9FoZP78+RXuHxQURMeOHcnJyeHzzz+XejAhhBD3BKPZyNfJX7MwYSGJGYn4OPnwWKvHmNBsAv4u/mXjdEkJ2WvXkT53LiWnT+NQP4iab7+F14MPYrgHe3xVBqX1vXUXoPDwcB0XF3fNc4mJiYSGhlZTRL/ejBkzGDZsGGPGjKnuUMRN3OufMSGEqGzZxdksO7aM6MRorhReoaFXQ6Y2n8qwhsNwtv/fRIKloICsZctI/2IBpkuXcAoNxf+xR/EYMABlZ1eNr6BqKKX2a63DK9omM2FCCCGEuG3J2cksSlzE6hOrKTIX0aVWF97u9jZda3ctq/cCMGdlkbFoMZkLF2LOzsa1Qwdq/eVt3Lp3vzvqnrUGsxHsq28WTpKwu8CCBQuqOwQhhBDiprTW7Lu0z1rvde57HAwODGs4jCnNp9DUp+k1Y42XL5PxxQIyly5FFxTg3rs3fo89imvbttUU/XWMhfDTcvjhvxA6DCJeqbZQJAkTQgghRIWMZiNfJX9FVHwURzOP4uvsy5Otn2RcyLhr6r0Aik+fJn3ePLJXrwGLBc+hQ/CbORPnpk1vcvQqln0O9s2D/QugMINLzg1JzvGlczWGJEmYEEIIIa6RWZTJsmPLiEmKIa0wjUZejXir61sMbTgUJzuna8YWxseTPnsOuZs2oRwd8Rk7Ft+HH8Kx7l1we2it4ewe+OFzdOJa0Jq9Tp35V0kfDpjCeMatiSRhQgghhKh+p7JPsShhEWtOrqHYXEy32t14t9u7dKl97b0atdYU7N1H+uzZ5O/cicHdHb9HH8V32lTs/f1/5gxVxFgE8Svhh8/h4iGK7DxYyjBmF/fB4FqfKYOC+E/7evi4Ve9VmZKECSGEEL9hWmv2XNzDwoSFbD+/HUeDI8MbDWdK6BQa+zS+dqzFQt5335E2ezZFhw5j5+9PwO9fwGfCBOw8PKrpFZSTcwH2zUPvX4AqSOO8QzCfGh/hy+JudAkJ4p0u9enZJACD4S64MABJwqrFm2++yZw5cwgICMBkMvHXv/71mtscVZVjx44xa9Ysjh8/joeHB40bN+bjjz8mJSWFqKgoPvroI7Zu3YqjoyNdu3at8viEEELYTom5hPWn1rMwcSHHM4/j6+zLU22eYlzTcfi5+F0zVhuNZK9fb+3xdeIkDnXrUvONP+M1ciSG6u5rqTWk7C1dclwDFjM77TryWcljJBnaMq57EBs7BVHP17V646yAJGHV5Pnnn+fFF18kMTGRHj16kJqaisFguPWOd8hkMmFvf+OvuaioiKFDh/LPf/6T4cOHA7B161auXLlCeHg44eHhZc+5u7tXSxJmNpux+w30kBFCiKqUUZTB0qNLiU2KJb0onSY+TXi769sMaTjkhnovS2EhWStWkjF/PsYLF3Bq2pTaH3yA5+BBqAq+W6qUqRiOXF1yPEihwZ0Y0yC+MPbFv14I07rUZ3CLWjg73L3fI5X/rf8bFhUVRatWrWjdujVTp04lOTmZPn360KpVK/r27cvZs2dv2Cc0NBR7e3vS0tJ48MEHad++PWFhYcyePbtsjLu7O88//zxhYWH07duXK1euAHDy5EkGDRpE+/bt6dGjB0lJSYC1+esTTzxBp06deOmll/j+++9p06YNbdq0oW3btuTm5hIdHU2XLl3KEjCAiIgIWrRowdatWxk2bBjJycl8/vnn/Otf/6JNmzZs376dK1euMHr0aDp06ECHDh3YuXMnQIXnAPjggw/o0KEDrVq14o033ig716JFi+jYsSNt2rTh8ccfx2w2l73W3//+97Ru3Zrdu3dX8m9ICCF+u05mneTNXW8yYPkAPj34KaF+oczuP5sVw1cwssnIaxIwc04OaZ9/zom+/bj8zjvY16xJ3c//Q4PVX+I1fFj1JmA5F+Hbd9H/CoMvnyAlNZ0/Gh+mm/ETjrZ6mf88M5pVT3VjZNu6d3UCBljXgu+lP+3bt9fXS0hIuOG5qnbkyBHdpEkTfeXKFa211unp6XrYsGF6wYIFWmut582bp0eMGKG11vqNN97QH3zwgdZa6z179uhatWppi8Wi09PTtdZaFxQU6LCwMJ2Wlqa11hrQixYt0lpr/dZbb+mnn35aa611nz599LFjx8qO07t3b6211tOnT9dDhw7VJpNJa631sGHD9I4dO7TWWufm5mqj0aiff/55/e9//7vC1/Ldd9/poUOH3hCr1lpPnDhRb9++XWut9ZkzZ3SzZs1ueo6NGzfqRx99VFssFm02m/XQoUP1999/rxMSEvSwYcN0SUmJ1lrrJ598UkdGRpa91iVLltzx+29rd8NnTAgh7pTFYtE7z+/Uj3/zuG6xoIVuv7C9fnPX
m/pk5skKx5dcvqwv/f3vOqlde50Q0kyfeewxnb9vXxVHXQGLReuze7Ve9rC2vOWrLW946a1v9NGTXn1PR/z9Wz13+ymdlV9S3VFWCIjTN8lp7rvlyL/t/RtJGUmVesxmvs14uePLPzvm22+/ZezYsfiXXhXi6+vL7t27WblyJQBTp07lpZdeKhv/r3/9i0WLFuHh4cGSJUtQSvHRRx+xatUqAFJSUjh+/Dh+fn4YDAbGjx8PwJQpUxg1ahR5eXns2rWLsWPHlh2zuLi47OexY8eWLeV169aNF154gcmTJzNq1Cjq/orLhjdv3kxCQkLZ45ycHPLy8io8x6ZNm9i0aRNtSxv05eXlcfz4cQ4fPsz+/fvL7q9ZWFhIYGAgAHZ2dowePfoXxyeEEAKKzcVsOLWBqIQoTmSdwM/Zj2faPMO4kHH4OPvcML7k7FnS580ne9UqtMmE56BB+D32KM7NmlVD9OWYiiH+S/QPn6MuHKBAuRJt7M8i8wCahrbiiS716dbI/64ptL9T910Sdq+4WhN21datW9m8eTO7d+/G1dWViIgIioqKKtxXKYXFYsHb25uDBw9WOMbN7X93on/llVcYOnQoGzZsoFu3bmzcuJGwsDC+//77O47bYrGwZ8+eG24wXtE5tNa8+uqrPP7449eM/fjjj5k+fTrvvffeDcd3dnaWOjAhhPiF0gvTrfVeR2PJKMqgqU9T3un2DoMbDMbR7sZ2DEVJSaTPnkPO11+j7OzwGjUKv0cexjEoqBqiLyf3EsR9gSVuHob8K5xVdZhjfIjvnfswokcI0Z2CqO3tUr0xVoL7Lgm71YyVrfTp04eRI0fywgsv4OfnR0ZGBl27diU2NpapU6eyePFievTocdP9s7Oz8fHxwdXVlaSkJPbs2VO2zWKxsHz5ciZMmEB0dDTdu3fH09OTBg0asGzZMsaOHYvWmsOHD9O6desbjn3y5ElatmxJy5Yt2bdvH0lJSUyaNIn33nuP9evXM3ToUAC2bduGr6/vNft6eHiQk5NT9njAgAF8/PHH/OEPfwDg4MGDtGnTpsJzDBw4kNdff53Jkyfj7u7O+fPncXBwoG/fvowYMYLnn3+ewMBAMjIyyM3NpX79+r/qdyCEEL9VxzOPsyhxEetOrqPEUkLPuj2Z1nwaHWt2rPA+jQVxcaTNmUP+99swuLri+9AMfKdPx6F0VaLanNsPP3yOJX4VBouR7y1tmW96hOJ6PZjcpQF/blELR/v7p5z9vkvCqktYWBh//OMf6dWrF3Z2drRt25aPP/6Yhx56iA8++ICAgAC++OKLm+4/aNAgPv/8c0JDQwkJCaFz5//18HVzc2Pv3r288847BAYGsmTJEgAWL17Mk08+yTvvvIPRaGTChAkVJmH//ve/+e677zAYDISFhTF48GCcnJxYt24ds2bNYtasWTg4ONCqVSs+/PBD0tLSyvYdPnw4Y8aMYfXq1Xz88cd89NFHPP3007Rq1QqTyUTPnj35/PPPb3qOxMREunTpAliL7hctWkTz5s155513GDBgABaLBQcHBz799FNJwoQQ4g5ordl5YScLExay68IunO2cebDxg0xuPpmGXg0rHJ/3/fekz55D4YED2Pn4EDDrOXwmTsTOy6saXkEpUwkkrMay5z8YLuynQLkQa+zLMsMg2rYN57XO9Qmt5Vl98dmQstaM3TvCw8N1XFzcNc8lJiYSGhpaTRHZnru7O3l5edUdxm/a/f4ZE0LcO4pMRaw7tY5FCYs4mX2SAJcAJjabyNimY/F29r5hvDaZyPnqa9LnzKH42DHsa9fC76GH8R4zGoNLNS7p5aVC3BeY987FriCVZGox3ziAAz6DGNMllFHt6+Lp7FB98VUSpdR+rXV4RdtkJkwIIYS4B6QVprHk6BKWJC0hsziTZr7N+Gv3vzIoeBAOdjcmK5biYrJXriR93nyM587h2KgRtd5/D6+hQ1EO1ZjcnD+A5YfP4chKDBYj28ytibI8hEuz/kzp2oC3GvpVuIR6P5Ik7B4gs2BCCPHbdSzzGAsTFrL+1HpMFhO96vZiWtg0wmuEV5ismPPyyIyJISMyCnNaGs6tWlHjlZdx79MHZYOm4LfFbISE1Rh3f47DhX0U4cwSUx/WOw+jW7fOvNcxiJpe1dx5vxpIEiaEEELcZSzaws7zO4lKiGLPxT242LswqskopoROIdgruMJ9TOnpZERGkRkTgyU3F7euXfH7xz9w7VRxcX6VyLuC3v8Fxh/m4lhwmfO6JgtM0zhTbwRjuoYRE1YDB7v7p9D+TkkSJoQQQtwlikxFrD21loUJCzmdfZpAl0Cea/ccY5uOxcup4uL5knPnyZg/n6wVK9AlJXgMGIDfo4/i0iKsiqMv58KPmHb/FxW/HDuLkd3mVsQaHiKw7VAmd2lA0xp3wc2+7wKShAkhhBDV7ErBFWKPxrL06FKyirMI9Q3lvR7vMbD+wArrvQCKjx8nbc4cctZvAIMBrxEP4PfwIzg1bFDF0ZcyGyFxLUU7P8P54j5KcGKZKYJt3iOJ6N6dD9rWwd1J0o7y5N0QQgghqsnRjKNEJUSx4fQGzBYzEfUimNZ8Gu1rtL/pEmLhwYOkzZ5D3rffolxd8Z0yBd+HZuBQs2YVR18qPw1L3BeU7JmDc+FlLlsCWWiZSlbIOMZ2C2NaA9/fTKH9nZIk7C7w7rvvEh0djZ2dHQaDgf/+97906tSJ4OBg4uLiym6FdFXXrl3ZtWsXycnJ7Nq1i0mTJgHWxqkXLlxgyJAh1fEyhBBC3AaLtrD93HYWJizkh0s/4GLvwtimY5kSOoUgz4o71Wutyd+xk/TZsynYtw87Ly/8n3kGn8mTsPe58TZEVeLiIYp2fIZ9wkrsdQl7zS1Z7fQIwZ1H8FinYAI9fnuF9ndKkrBqtnv3btatW8eBAwdwcnIiLS2NkpKSn91n165dACQnJxMdHX1NEhYXF3dHSZjJZMLeXj4GQghha4WmQtacWMOixEUk5yQT6BrI8+2fZ3ST0Tet99JmM7mbNpE2Zw7FCYnY16hB4Csv4zN2LIZyt6erMmYTOnEteds+wSM1Dot2Isbck5/qjKNPj178LTQQ+99wof2dkm/fSvTggw+SkpJCUVERzz33HI888giPPPIIcXFxKKV4+OGHef7556/Z5+LFi/j7++Pk5ARww6wXWG9wPWrUKEaNGsWjjz5a1rz1lVdeITExkTZt2jBx4kQ+/fRTCgsL2bFjB6+++irDhg3j2Wef5ciRIxiNRt58801GjBjBggULWLlyJXl5eZjN5l90D0khhBC3J7UgldikWJYeW0p2cTZhfmH8rcff6B/cHwdDxfVelpISslevJmPuPErOnMExOJha776D1/DhKMcb7wFpc/nplOydj/GHObgVXSbTEsDnhmmY20xmTLcWTA10r/qY7gOShFWi+fPn4+vrS2FhIR06dKB9+/acP3+eI0eOAJCVlXXDPgMGDODtt9+madOm9OvXj/Hjx9OrV6+y7Xl5eUyYMIFp06Yxbdq0a/Z9//33+cc//sG6desAqFGjBnFxcXz
yyScAvPbaa/Tp04f58+eTlZVFx44d6devHwAHDhzg8OHDN9wrUgghROVITE9kYcJCvkr+CrPFTJ+gPkxrPo22gW1vWiNlzssna+lSMhYswJSainNYGHU+/BCPfn1RdnZV/AqAi4fJ+f5TXI+uxFGXsNccxmbPx2jWczRPt62Hq6OkEb/GfffuXfrrXylOTKrUYzqFNqPma6/dctxHH33EqlWrAEhJSaGkpIRTp07x7LPPMnToUAYMGHDDPu7u7uzfv5/t27fz3XffMX78eN5//31mzJgBwIgRI3jppZeYPHnyHce9adMm1qxZwz/+8Q8AioqKOHv2LAD9+/eXBEwIISqZRVvYdm4bUQlR7Lu0Dxd7F8aHjGdys8nU86x30/1MmZlkLlxIxuJoLNnZuHbuTK33/opb165VX9RuNmFOXEf21o/xTYvDQTuyzNKTM40n0z8igjeCfKTQvpLcd0lYddm6dSubN29m9+7duLq6EhERQXFxMYcOHWLjxo18/vnnLF26lLfeeovhw4cD8MQTT/DEE09gZ2dHREQEERERtGzZksjIyLIkrFu3bnz99ddMmjTpjj/0WmtWrFhBSEjINc//8MMPuFVHLYEQQtynCowFrD65msWJizmTc4aabjX5ffvfM6rpKDwdb37zaePFi6R/8QVZy5ajCwtx79cX/0cfxaV16yqMvlRBBnm75qH3zcGj+DL5lgAWO87ApeN0Huwahr+7U9XHdJ+775Kw25mxsoXs7Gx8fHxwdXUlKSmJPXv2kJaWhsViYfTo0YSEhDBlyhTq1avHwYMHy/Y7evQoBoOBJk2aANbi+vr165dtf/vtt3n77bd5+umn+eyzz645p4eHB7m5uTd9PHDgQD7++GM+/vhjlFL8+OOPtG3b1kbvgBBC/PZcyr9ETFIMy48tJ6ckh5b+Lfmg5wf0rd/3pvVeAMWnTpE+Zy7Za9cC4DVsGH6PzsSpUaOqCr2MvvQTqZs/xufkKtx1CTvNYeyr8RRhEeN4qnkt7Awy62Ur910SVl0GDRrE559/TmhoKCEhIXTu3Jnz588TERGBxWIB4L333rthv7y8PJ599lmysrKwt7encePGzJ49+5oxH374IQ8//DAvvfQSf//738ueb9WqFXZ2drRu3ZoZM2Ywffp03n//fdq0acOrr77K66+/zqxZs2jVqhUWi4UGDRqU1Y8JIYT45eLT41mYsJCNpzdiwULfoL5Maz6N1gGtf3bVovCnn0ifPYfczZtRTk74TJiA30MzcKhTpwqjByxmCo+sJfu7j6mZGYendmSN6kl68+kM6NOXWf6yWlIVlNa6umO4I+Hh4TouLu6a5xITEwkNDa2miMRvgXzGhBBmi5mt57ayMGEh+y/vx9XelVFNRjE5dDJ1PeredD+tNQV79pA2ezYFu/dg8PTEZ/IkfKdOxb6qa3MLMkjdNheH/fPwMV7inPZnk9sD+HZ/hIHhobg4VkPx/31OKbVfax1e0TaZCRNCCCF+RoGxgFUnVrE4cTEpuSnUcqvFi+EvMqrJKDwcb34PRG2xkLt5M+lz5lL000/YBwQQ+Ic/4D1+HHbuVdvSwXjhCBc2fUjN5NUEUsweS3MSg56mbf9JPBTkJ4X21USSMCGEEKICl/IvEZ0UzfJjy8ktyaVVQCuea/ccfYP6Ym+4+denLikhe+060ufOpeT0aRyCgqj51lt4PTgCg1MVFrdbzGQcXEPe958QlB1HDe3AJvteFLWbSd+IvnR2q4Z+Y+IakoQJIYQQ5RxJO0JUQhSbkjeh0fQL6sfU5lNpE9jmZ/ezFBSQtXw56V8swHTxIk7NmlHnn/+Hx8CBVdrjSxdkcmbzf3E//AX+pksUaj+W+c6kZsRjDGnZVArt7yI2S8KUUvOBYUCq1rpFBdsV8CEwBCgAZmitD/zS82mtZTpV2MS9VjcphLhzZouZrSlbiUqI4kDqAdwc3JgcOplJoZOo4/7zRfPmrCwyFi8mc+EizFlZuIaHU+vtt3Dr3r1Kv5fyzh3h3Nf/pv65NQRTzH5C2dbkd4QPmMrYgJu3yRDVx5YzYQuAT4Com2wfDDQp/dMJ+E/pf++Ys7Mz6enp+PnJuraoXFpr0tPTcXaWG9EKcT/KN+bz5YkvWZSwiHN556jjXoeXOrzEyMYjcXf8+bot4+XLZCyIJGvJEiwFBbhHROD32GO4tqvCVkAWM2d/+JLinZ/RJC+OBtqB7S69oeNjdO/Rh/YOUmh/N7NZEqa13qaUCv6ZISOAKG2dZtijlPJWStXSWl+803PVrVuXc+fOceXKlV8arhA35ezsTN26N7/ySQhx77mcf5nFSYtZfnQ5ucZc2gS04fn2z9MnqM/P1nsBlCQnkz5vHtlfrkZbLHgOGYLfzJk4hzStouihOC+DY19/TkBiJEHmS1zSvnxV8zHq93+Kfo0bVFkc4tepzpqwOkBKucfnSp+74yTMwcGBBg3kQyeEEOLnHc04SmR8JF+d/goLFvoF9WNamLW/160UJSSQNnsOuRs3ohwc8BozGr+HH8ax3s1vR1TZLp86xIWNHxJyeT0tKeKwIZT4Fi/QfuAUBntIb697zT1RmK+Uegx4DCAoKKiaoxFCCHEv0Vqz88JOIuMj2XNxDy72LkxoNuGW/b2u7luwdx/pc+aQv2MHBnd3/GbOxHf6NOz9/askfovZTMK25ai9/yWscD/e2p44j764dH+SNh0jMEih/T2rOpOw80D5fz7ULX3uBlrr2cBssDZrtX1oQggh7nUl5hLWn1pPVEIUJ7JOEOASwKx2sxjTdAxeTl4/u6+2WMjbupX0/86m8NAh7Pz8CHj+eXwmTcTO4+a9wSpTdmY6CRs+I+jEIlroS6Tiy7Z6T9B40NN0qyMTEveD6kzC1gDPKKVisRbkZ/+SejAhhBCivOzibJYeXUp0UjRphWk08WnCu93fZXDwYBzsbn4/RwBtNJKzYQPpc+dSfPwEDnXqUOPPr+M9ahSGKrpA51j8fq5s+Zg26RvooopJcmjOvtZ/oFX/KfR0kouE7ie2bFERA0QA/kqpc8AbgAOA1vpzYAPW9hQnsLaoeMhWsQghhLj/peSksDBxIV+e+JJCUyFda3fl3e7v0qVWl1teOW8pKiJrxQoy5s3HeOECTk2aUPuDv+M5eDDK3vbzFUUlRvZvWYbLgbm0M+4nWNtzxLcfXhFP06x1T5ufX1QPW14dOfEW2zXwtK3OL4QQ4rfhYOpBohKi2HxmM3YGO4Y0GMK05tMI8Q255b7mnBwyo2PIiIrCnJGBS5s21PjTn3CP6IUyGGwe+7mLl0n8+j+EnImhG5dIUz4caPQUTYY8Qzu/Kr6pt6hy90RhvhBCCFGe2WLmu5TviIyP5OCVg3g4evBwi4eZFDqJQNfAW+5fdPQomYujyV67Fl1YiFuPHvg/9igu4eE27zdpsWj27t9L3rbP6JzzNf1VEaecmnM0/GWa9p6Mv30V3tpIVCtJwoQQQtwzCowFrD65moUJC0nJTaGOex1e6fgKIxuPxNXB9Wf31SUl5G7eTEZ0NIVx+1FOTngOG4
rvlCk4h4baPPbMvCJ2b1qK75Ev6Gw5gBF7jgf2J7DfszQM6Wbz84u7jyRhQggh7npphWlEJ0az9NhSsouzaeV/ezfTBmtn+6wlS8lcthTzlTQc6tUj8A9/wHv0KOy8vW0e++GT5zjxzWzaXFzKEHWRLIMPR5s9Q4NBz9Dcu5bNzy/uXpKECSGEuGudyDxBVEIU606tw2Qx0bteb2a0mEGbgDY/u2yotaZg3z4yF0eTu3kzWCy49eyB76RJuPXoYfN6ryKjmW937cG0+7/0LtxEK1VIilsoFzq/Su2uE/G2d7Tp+cW9QZIwIYQQdxWtNT9c+oHI+Eh2nN+Bs50zo5qMYmrzqdT3rP+z+5rz8slZu4bM6GiKj5/A4OWF7/Tp+EwYj2MVNPtOvpLHrm+WUfdYFIP0j5iVHSm1B2Do/xz1Gna2+fnFvUWSMCGEEHcFo8XI16e/JiohiqSMJHydfXm6zdOMDxmPj7PPz+5bfOqUtdD+yy+x5Ofj3Lw5td59B88hQzC4uNg0brNFs+2n05zdOp9u6SuYZLhAjp0PF8KepU6/p2joKUuOomKShAkhhKhWuSW5LD+2nMWJi7lccJmGXg15q+tbDG04FCe7m18pqE0mcr/7jszoaAp270E5OOAxeBC+kybh3Lq1za9yTM8r5qvtu7CPm8cQ02Z6q0Iue4aS3f01vMLH4SlXOYpbkCRMCCFEtbiQd4FFiYtYcWwFBaYCOtbsyJ+7/JnudbpjUDev2TKlp5O1bBmZS5ZiungR+1q1CJg1C++xY7D387NpzFprDpzJ5IctKwg5E8MkdQCLMpBabyCu/Z+jRlAnsHHyJ+4fkoQJIYSoUvFp8UTGR7LpzCYABgYPZHrYdJr7Nb/pPlprCg8eJDM6hpyvvwajEbeuXaj5x9dwj4iweVf7ghIT6+JOkLojkoF5q3nKcJ58R2+yWv8O315PUNuztk3PL+5PkoQJIYSwOYu2sO3cNiLjI4m7HIebgxtTQqcwOXQytdxvXjNlKSwkZ/16MqKjKU5IxODujs/48fhMmohTw4Y2j/vklTzWfb8L7yORjNTf4qkKSPcOpbjna7i1HoObg9zLUfxykoQJIYSwmSJTEWtPrSUqPorknGRqutXkxfAXGd1kNO6O7jfdr+TsWTJjYslauRJLdjZOTZpQ88038Bo+HIObm01jNpktbE64zI/fryb88lKeNRxAK0V2gyHoPs/iV0+WHEXlkCRMCCFEpcsoymBJ0hJij8aSUZRBqG8o7/d4nwHBA3AwOFS4j7ZYyNu2jczoaPK37wA7Ozz69cN38qQquZ1Qam4RK3YfI2fvIh4sWc8gwzkKnb0pbDcLt66P4usl93IUlUuSMCGEEJXmdPZpFiYsZM3JNRSbi+lZtyczwmYQXuPmSZQ5K4usFSvJjI3FmJKCXYA//k89hfe4cTjUuPV9IH8NrTX7kjNZv30P9Y4vZpLhW7xUATm+oZh7fopLyzEgS47CRiQJE0II8atordl/eT+RCZFsTdmKo8GR4Y2GM635NBp637xuq/BIPJnR0eSsX48uLsYlvD2Bz8/Co18/lKNtO8obzRbWHTrP7m/X0Cd7FX+2i0PZGShoNBh6PoNnUGdZchQ2J0mYEEKIX8RkMbH5zGYi4yM5kn4EbydvHm/1OBOaTcDfxb/CfSwlJeR+9RUZ0dEUHTqMcnHB68EH8Zk0CeeQpjaPObfIyNI9pzi/YyGjS9Yw0nCGYmdvLB1m4dBpJu5edW0egxBXSRImhBDijuQb81l5fCWLEhZxIf8C9T3r83rn1xneaDgu9hV3pzdeuEBm7BKyli/HnJGBY3AwNV57Da+RD2Ln4WHzmC9mF7Jk60EMBxYwga8JVFnkezfC0vNDnFqPBwfbdtUXoiKShAkhhLgtl/MvszhpMcuPLifXmEu7wHa83PFlIupFVNhcVWtNwe7dZCyOJu+77wBw790bn0kTcevSxeY30QZIvJjD6m++pf7xKJ4wbMNZGcmt0xMinsOtcV9ZchTVSpIwIYQQP+toxlEi4yP56vRXWLDQL6gf08Om0yqgVYXjzbm5ZK/6ksyYGEpOn8bOxwe/mTPxGT8Ohzq2v8JQa82O41fY9c1yOl2K5RW7Q5jsHSlqPhZ6/Q6PwFCbxyDE7ZAkTAghxA201uy8sJPI+Ej2XNyDi70LE5pNYHLoZOp6VFw3VXT0GJnR0WSvXYsuKMC5dStq/+19PAYNwuBk+/soGs0WNvx4mlPfLmBw3ipeNqRQ4OxHUadXcO7yKO5uFdepCVFdJAkTQghRpsRcwvpT64lKiOJE1gkCXAKY1W4WY5qOwcvJ64bx2mgkd/NmMhdHUxAXh3JywnPoUHwmTcKlRViVxJxbZGT1joMU7Z7Ng6avGaFyyPJqijHiE1xbjwO5kba4S0kSJoQQguzibJYeXUp0UjRphWk08WnCu93fZXDwYBzsbmyuarycStbSpWQtXYrpyhUc6tYl8A8v4jVqFPY+PlUS88XsQtZ/sxm/n+Yylh04KRNpdSKw9Hke70a9pN5L3PUkCRNCiN+wlJwUFiYu5MsTX1JoKqRr7a682/1dutTqckNzVa01hXFxZERHk/vNZjCZcOvZg5p/eRv3Hj1QdnZVEnPihSx2fBVL6JmFzDQcocTgRE6zCTj1nYW/f5MqiUGIyiBJmBBC/AYdTD1IVEIUW85uwaAMDGkwhGnNpxHiG3LDWEt+Ptlr15IZHUPxsWMYPD3xnTIFn4kTcKxfv0ri1VqzO+kcRzfNpmf6Mh41XCTHyZ+s8Ffx7vEY/q6+VRKHEJVJkjAhhPiNMFvMfJfyHZHxkRy8chAPRw8eCnuISaGTCHS98fZAxadOkxkTQ/aqVVjy8nBqHkqtd/6C59ChGFyqpq+W0Wxh8w8Hyd72GQMLv6KryiPVM5SCXn/Gs+0YsLdtZ30hbEmSMCGEuM8VGAtYfXI1CxMWkpKbQh33OrzS8RVGNh6Jq4PrNWO1yUTe1q3Wm2jv2g0ODngOHIjP5Em4tGlj85toX5VbZOSbLZtw2f9f+pp3YK8sXKjZG7eBvyewQTep9xL3BUnChBDiPpVWmEZ0YjRLjy0luzibVv6tmNVuFn2D+mJnuLZ+y5SRQday5WQuicV04SL2NWsSMOs5vMeMwd6/6lo7XMzMY+eGxQQf+4JRKpFC5cLlkMnUGfA8df1vfh9KIe5FkoQJIcR95kTmCaISolh3ah0mi4ne9Xozo8UM2gRcO5Oltabo0CFrof1XX6ONRly7dKbGq6/i0bs3yr7qviKSzl4kYf1ntL+0hDHqMhkONbjQ7o/U7v0Y9Vy8qywOIaqSJGFCCHEf0Frzw6UfiIyPZMf5HTjbOTOqySimNp9Kfc9ri+ctRUXkrF9P5uJoihISMLi54T1uHD6TJuLUqFGVxrzv0E9c2fIRPXLW00wVkOIeRlqPt/HvMAbs5CtK3N/kEy6EEPcwo8XI16e/JiohiqSMJHydfXm6zdOMDxmPj/O1/bpKUlLIjIkle8UKzNnZODVpTM03/ozn8Aewc3erupjNFnZu/Rq15zO6l
exEKTgd2AfDgN9Tr0nXKotDiOomSZgQQtyDcktyWX5sOYsTF3O54DINvRryVte3GNpwKE52/+sQry0W8nfsIGPxYvK3bQeDAY9+/fCZNAnXjh2qrNAeILegkB82RFEjfh4R+ij5uHKy0VSCh7xAY//gKotDiLuFJGFCCHEPuZB3gUWJi1h5fCX5xnw61uzIn7v8me51umNQhrJx5qwsslauIjM2FuPZs9gF+OP/5JN4jx+HQ40aVRrzpdTLHFn7CaFnY+inrnDZribHWv+Jxv0fJ8TFs0pjEeJuIkmYEELcA+LT4omMj2TTmU0ADAweyPSw6TT3a37NuML4eDKjo8lZtx5dXIxL+/YEznoOj379UI5V21PrxNEjnN/4b9qnr6OfKuSEayuSu71DcNex1DBUTXd9Ie5mkoQJIcRdyqItbDu3jcj4SOIux+Hm4MaU0ClMDp1MLfda/xtXUkLuxo1kLo6m8OBBlIsLXiNG4DNpIs7NmlVpzNpi4afdmyje8THtCnZSHwOJfn0JHPACjZt1qdJYhLjbSRImhBB3mSJTEWtPrSUqPorknGRqutXkxfAXGd1kNO6O7mXjjBcvkhm7hKxlyzBnZOBYvz41Xn0Fr5EjsfOs2mU+Y0kxBzcuwOvgHFqZj5ONOweDptN42Au0qlE1tzYS4l4jSZgQQtwlMooyWJK0hNijsWQUZRDqG8r7Pd5nQPAAHAwOgLWtQ8GePWRGR5O75VsA3CMi8Jk0CbeuXVAGw8+dotLlZl0hce1H1D+5mA6kc85Qm/0t/kSLIY/T3lXqvYT4OZKECSFENTudfZqFCQtZc3INxeZietbtyYywGYTXCC+7etGcl0f2qi/JjImh5NQp7Ly98XvkEbzHj8exbp0qjzk1+QgpG/5J6OV1dFTFHHFsw+VOf6VFxFjq2km9lxC3Q5IwIYSoBlpr9l/eT2RCJN+nfI+DwYHhjYYzrfk0Gnr/7/Y8xcePkxEdTfbqNeiCApxbtaLW++/hOXgwBiennzmDTYLmzP6N5G79kOa5u/HCjoNeffHpO4sWraW/lxB3SpIwIYSoQiaLic1nNxN5JJIj6UfwdvLmsVaPMaHZBPxdrPdo1EYjuVu2kLk4moJ9+1COjngOGWK9iXbLllUeszYVc3xLJE5xn1PfeJIM7cH22jNoPGQWneoFV3k8QtwvJAkTQogqkG/MZ+XxlSxKWMSF/AvU96zP651fZ3ij4bjYuwBgTE0la+kyspYuxZSaikOdOgS++Hu8Ro/G3sfnFmeofMbcKxxf/xE1jy6kqc7kFHX5tumfaDf0MXp5eVV5PELcbyQJE0IIG7qcf5nFSYtZfnQ5ucZc2gW24+WOLxNRLwKDMlgL7ePirL29Nn0DJhNuPXpQ8603ce/ZE1UN9VX55+I5u+H/aHhhLc0pYZ9dW35q/z6d+4+hoYN8bQhRWeT/JiGEsIGjGUeJjI/kq9NfYcFCv6B+TA+bTquAVgBY8vPJXLuOzJgYio8exeDpie/kyfhMnIBjcHDVB6w16T99TdaWf9Moew8NtAM73Prh1utZOnboisFQdbc3EuK3wqZJmFJqEPAhYAfM1Vq/f932+sB8IADIAKZorc/ZMiYhhLAVrTU7L+wkMj6SPRf34GLvwoRmE5gcOpm6HnUBKD59msyYGLJXfYklNxen0FBq/uVtvIYOxeDqWvVBG4u4sCMKteczahWfxqK9WOf/EA0GPUvfJo2qPh4hfkPuKAlTSrlqrQtuc6wd8CnQHzgH7FNKrdFaJ5Qb9g8gSmsdqZTqA7wHTL2TmIQQorqVmEtYf2o9UQlRnMg6QYBLALPazWJM0zF4OXmhzeayQvv8XbvAwQHPAQOshfZt21bpTbSv0rmXSdn0Md7xC6ltySJJ1+eHBn+i/ZCZDAus+vozIX6LbisJU0p1BeYC7kCQUqo18LjW+qmf2a0jcEJrfar0GLHACKB8EtYceKH05++AL+8oeiGEqEbZxdksPbqU6KRo0grTaOLThHe7v8vg4ME42DlgysggLXIOmbExmC5cxL5GDQKe+x3eY8ZgHxBQLTEbL/zEha//j9pn1xKEie2qPemtZhIxcDTN3Kq45YUQv3G3OxP2L2AgsAZAa31IKdXzFvvUAVLKPT4HdLpuzCFgFNYly5GAh1LKT2udfptxCSFElUvJSWFh4kK+PPElhaZCutbuyrvd36VLrS4opSg8fJgrixeTs+ErtNGIa6dO1HjlFTz69EHZV0MprsVCQcJXZG75N3Uy9xKgndjgOACHrk/Rt0dXnOyluaoQ1eG2/zbQWqdcN2VuroTzvwh8opSaAWwDzld0XKXUY8BjAEFBQZVwWiGEuHOHrhwiMj6SLWe3YFAGhjQYwrTm0wjxDcFSVGTtaB8dTdGRIxhcXfEeOxafSRNxaty4egIuKSB7z0JMuz7Fr+gM2dqXaM+HqNvvSYa3bCrF9kJUs9tNwlJKlyS1UsoBeA5IvMU+54F65R7XLX2ujNb6AtaZMJRS7sBorXXW9QfSWs8GZgOEh4fr24xZCCF+NbPFzNaUrSyIX8DBKwfxcPTgobCHmBQ6iUDXQErOnePyvA/IXr4Cc3Y2jo0aUeP1P+E1YgR27u63PL5N5Fwg7btPcDm8EC9zDoctDVlb+4+0G/IQk4KqZxlUCHGj203CnsC6ZFgHayK1CXj6FvvsA5oopRqU7jMBmFR+gFLKH8jQWluAV7FeKSmEENWuwFjA6pOrWZiwkJTcFOq41+GVjq8wsvFIXOycyd+5k5RFfyZv2zYwGPDo2xefSZNw7dSxWgrtAfSFH7my6d/4Jq/FV1vYQjgpIQ/Tf+AIZvi5VUtMQoibu60kTGudBky+kwNrrU1KqWeAjVhbVMzXWscrpd4G4rTWa4AI4D2llMa6HHmrxE4IIWwqrTCN6MRolh5bSnZxNq38WzGr3Sz6BvWF3DyyFi3lQmwMxjNnsfP3x//JJ/AeNw6HmjWrJ2CLGVPiBrK+/Tf+6XG4ameW2Q3CFP4YwyO60t/VsXriEkLcktL61qt7pbNZzwLBlEvctNYP2CyymwgPD9dxcXFVfVohxH3uROYJohKiWHdqHSaLid71ejOjxQzaBLShOCmJzOhosteuQxcV4dKuHT6TJuE5oD/KsZqSnOI8ivZFUbzzM7wKUzin/VnnPJwavR5jSMcQKbYX4i6hlNqvtQ6vaNvtLkd+CcwD1gKWSopLCCGqldaaHy79QGR8JDvO78DZzplRTUYxtflUgpxrkbNxE2eiP6Dwxx9Rzs54DR+Oz6SJOIeGVl/QWSnkb/8Mu4NROJvziLc0YZvfa7TpP4XHmtWWYnsh7iG3m4QVaa0/smkkQghRRYwWI1+f/pqohCiSMpLwdfbl6TZPMz5kPO6ZRWTOX8LxZcsxp6fjUD+IwFdexnvkSOyq86bV5+LI+e7fuJ3cgJPWfG3pSFLwVAYMHMbzdb2rLy4hxC92u0nYh0qpN7AW5BdffVJrfcAmUQkhhA3klOSw4tgKFiUuIrUglYZeDXmr61sMaTAEc9xBMv/wBpe+/RYsFtwjIvCZ
NAm3bl1RBkP1BGw2oRPXkrv1QzzTfgTtSiRDyG75EGP6dGGYbzXc5kgIUWluNwlrifV2Qn3433KkLn0shBB3tfN551mUsIiVx1dSYCqgU81OvNHlDbrV6Eze15u48OIEio8exc7bG7+HZuA9YQKOdetWX8BF2Zj3R1G84zNcCy+QaQnkv/YP49VlBuO6heItxfZC3BduNwkbCzTUWpfYMhghhKhMR9KOEBkfyTdnvkGhGNhgINObTyfENZislSs5Pf9tjOfP49ioEbXefRfPYUMxOFXjrXsyTlOy6z/w4yIczfn8ZGnGercZtOw9nt+1qyfF9kLcZ243CTsCeAOptgtFCCF+PYu28H3K90QmRLL/8n7cHdyZ1nwak0InEWByITM6mhMLF2HOyMClTRtq/PE13CMiqm/JUWs4u4ei7R/jeOIrlFastXRhX43x9O83kDebBkqxvRD3qdtNwryBJKXUPq6tCavyFhVCCFGRIlMRa06uYWHCQpJzkqnlVos/hP+BUU1G4ZSZT8YnkZxYsgRLQQFuvXri/+ijuLRvX22NVTEbIf5LCrd/hMuVwxRqd+abh3G+yRTG9+3EKCm2F+K+d7tJ2Bs2jUIIIX6h9MJ0Yo/GsiRpCZnFmYT5hfH3nn+nf/3+mJPPkv7We2SvWQsWC55DhuA38xGcQ0KqL+DCTHTcAkp2fY5T4SUuWGqxiJnYt5vEtJ6h1JNieyF+M263Y/73tg5ECCHuxKnsU0TFR7H25FpKLCVE1I1geth02tdoT9FPP3HpuefJ3bwF5eiIz7hx+D70EI5161RfwGknMO/5DP1jNPbmQvaZw1jm8DBNe47kuc7BUmwvxG/QzyZhSqkdWuvuSqlcrFdDlm0CtNba06bRCSFEOVpr4i7HsSB+AdvObcPJzokRjUcwtflUgj2Dyd+xk7MvP0TBDz9g8PTE74nH8Z0yBXs/v+oKGJK3Y9r5CXYnNmHGjtWmrnzjNZp+EX35e9vaUmwvxG/YzyZhWuvupf/1qJpwhBDiRkaLkU3Jm4iMjyQxIxFfZ1+eav0U45uNx8fek5yNGzk99/cUJyZiX6MGgS+/jPfYsdi5V9NNq03FcGQFxp2f4nDlCNl4ssg0kvjaY5jQJ5zPpdheCMFtLkcqpRZqrafe6jkhhKhMeSV5rDhuba56Kf8SwZ7B/LnLnxnecDiOJsj+8ktOzpuPMSUFxwYNqPXuu3gNH1Z993PMT4e4+Zh+mI19QSqndV3mmx6lKHQMD/VqxnP1vKsnLiHEXel2C/PDyj9QStkD7Ss/HCGEgIt5F1mcuJjlx5eTb8wnvEY4f+r0J3rU7YHOzSNzXhQZUVGY09NxbtWKwJf+gEffvtXXZiI1Cb3nMyyHYrEzF7PD3JqFzKRe+FCe7tFQiu2FEBW6VU3Yq8BrgItSKufq00AJMNvGsQkhfmPi0+OJjI9kU/ImAAYED2B62HTC/MIwXk7lyj/+j6zYJVjy83Hr3h2/Rx/FtWOH6mkzoTWc/BbL7k8xnNxCCY6sMHVnldMDRPTqwf91CpJieyHEz7pVTdh7wHtKqfe01q9WUUxCiN8Qi7aw/dx2IhMi2XdpH24ObkwOncyU0CnUcq9F8enTXPz362R/uRptNuM5aJC1zUTz5tUTsLEIDi/BvPsz7NKSyMCbBcax7PIZwYRebVgkxfZCiNt0uy0qXlVK1QHql99Ha73NVoEJIe5vxeZi1p5cS1RCFKezT1PDtQa/b/97RjcdjYejB4U/HeHc3L+Ru2kTysEBrzGj8XvoIRyDgqon4LxU2DcX89652BWmc5z6zCl5gktBQ3gkohkvSLG9EOIO3W5h/vvABCABMJc+rQFJwoQQdySjKIMlR5cQmxRLRlEGob6hvN/jfQYED8Be2VOwezdn5syhYPceDB4e+D32GL5Tp2Dv7189AV86Ans+w3J4GViMbLW0ZZ7pCXzD+vJoz0a0lmJ7IcQvdLuF+SOBEK118S1HCiFEBU5nn2ZhwkLWnFxDsbmYnnV7Mr35dDrU7AAWC7mbviF99hyKEhKwDwgg8A9/wHv8OOzc3as+WIsFTnyD3v0p6vT3FCtnYo29iFFD6dyhI3/r3kCK7YUQv9rtJmGnAAfK3TdSCCFuRWvN/sv7iUyI5PuU73EwODC80XCmNZ9GQ++GWIqLyVq6jPT58zCeOYtjcDA1//I2XiNGYKiONhMl+XAoBr3nc1T6cdKVH3ONE9jkPIjRfVsSK8X2QohKdLtJWAFwUCm1hWtv4P07m0QlhLinmSwmNp/ZTGR8JEfSj+Dt5M3jrR9nfMh4/F38MefmkjZnjrXNxJU0nFu0IPDDD/Ho1xdlVw1F7TkXYe9sdNwXqKJMklQj/lPyDEm+vXlkSFM2tKmDs4MU2wshKtftJmFrSv8IIcRN5RvzWXFsBYsTF3Mh/wL1PevzeufXGd5oOC72LpiuXCH1s3+SGRODJS8Pt65d8fv733Ht3Ll62kxc+BF2f4aOX4m2WNhCOP8tHoRd/S48HtGICCm2F0LY0O1eHRlp60CEEPeuS/mXiE6MZvmx5eQac2kX2I6XO75MRL0IDMpAyZkzXJz/BdmrVqFNJjwGDsDvkZm4tAi79cErm8UMRzdY673O7qbQ4EqMsT+RpgG0bNmG13s0lGJ7IUSVuN2rI09z7Q28AdBaN6z0iIQQ94ykjCQi4yP5+vTXWLDQv35/pjefTsuAlgAUxseTPncuuRs3oezs8Bo1Cr+HH8Kxfv2qD7Y4F35chGXP5xiykrlsqMEc4xS+cujHsC7NWNS5vhTbCyGq1O0uR4aX+9kZGAv4Vn44Qoi7ndaaHed3EBkfyQ+XfsDV3pUJzSYwpfkU6rjXQWtN/p49pM+eQ/6uXRjc3fF75GF8pk7FITCw6gPOOgs//BfL/kgMJbkcJoTPS2aR7N+LaQMbs7ltbVwdb/evQiGEqDy3uxyZft1T/1ZK7Qf+XPkhCSHuRsXmYtafWk9UfBQns08S6BrI8+2fZ0zTMXg6eqLNZnI2biJ9zhyKjhzBLsCfgN+/gM+ECdh5eFR9wCl70bs/hcQ1WDRsMHdinnkwgc26MaNrMF0a+VVPHZoQQpS63eXIduUeGrDOjMk/HYX4DcgqymLJ0SXEJMWQXpROiE8If+3+VwYFD8LBzgFLSQlZy5eTPnceJcnJOAQFUfOtt/B6cAQGJ6eqDdZsgsTVWHZ9iuHCfvKVG4uMQ1lhP5jeXdrxsSw5CiHuIrebSP1fuZ9NQDLWJUkhxH3qbM5ZohKiWH1iNUXmIrrV6caMsBl0qtkJpRTmvDzSlywkIzISU2oqzs2bU+df/8RjwICqbzNRmAUHIjHv+S92uec5R03mGGdwyG8wE7o1Z7UsOQoh7kK3uxzZu/xjpZQd1tsYHbNFUEKI6qG15uCVgyw4soDvUr7D3mDPsIbDmNZ8Go19GgNgSksjY+Eia5uJnBxcu3Sm1nt/xa1r16pf3ks/id7zHyw/LsbOVMBeS3PmmSdg13QQ07s15G1ZchRC3MV+NglTSnkCTwN1gNXA5tLHvwcOA4ttHaAQwvZ
MFhNbzm4hKj6Kw2mH8XLyYmbLmUwKnYS/i/WejSUpKaTPn0/2ylXokhI8+vfH79GZuLRsWbXBag1ndmLe9SmGY19hwo415i7E2g2nbeeevCFLjkKIe8StZsIWApnAbuBR4I+AAkZqrQ/aNjQhhK0VGAtYdWIVCxMWcj7vPPU86vFap9cY0WgErg7WRKYoMZH0OXPJ+fpra5uJB0fg+/DDODVoULXBmkogfiXGnZ/gkPoTOXiw0DSCXT4P8kD39kTKkqMQ4h5zq7+xGmqtWwIopeYCF4EgrXWRzSMTQthMakEq0YnRLD22lNySXNoGtuUP4X8gol4EdgY7a5uJH/aSPncu+du3Y3B1xfehGfhOm45DjSpuM5Gfjo6bj3HPbBwLU0nWdZhvmklOk1FM7h7Cs7LkKIS4R90qCTNe/UFrbVZKnZMETIh719GMo0QlRLHh9AYs2kLfoL5MD5tO64DWAGiLhdzNm0mbM4eiQ4ex8/MjYNYsfCZOwM7Lq2qDvXIU065PUYdisbMUs9vcili7mdTrOJynugTLkqMQ4p53qySstVIqp/RnBbiUPlaA1lp72jQ6IcSvprVm14VdRMZHsvviblzsXRjXdBxTmk+hnkc965iSErLXriN93jxKTp3CoV49ar7xZ7xGjsTg7FyVwcKp7yja/jHOyd9ixoGVpu586z2a3j168X+y5CiEuI/87N9mWusqvs5cCFFZSswlbDi9gaiEKI5nHifAJYDn2j3H2KZj8XKyzmqZ8/LJWraMjAULMF2+jFOzZtT+v3/gOXAgyr4Kkx1jEfrwEgq3f4pr1lFytRefmcdwrtFExvRow2xZchRC3Ifkn5RC3Geyi7NZdmwZ0YnRXCm8QhOfJrzT7R2GNBiCg50DAKaMDDIWLiQzOgZLdjauHTtS6513cOverWqTnbxUTHtmY9o7D+eSDJIt9Yk2PIVHhwlM6tpElhyFEPc1ScKEuE+k5KSwMHEhX574kkJTIV1rd+Wdbu/QpXaXssSq5Nx5Mr74gqwVK9DFxXj064vfzJm4tG5dtcFeOkLBto9wTFyJQZvYbm7LJs8XaNvjAV5rV0eWHIUQvwnyN50Q97iDqQeJSohiy9ktGJSBIQ2GMK35NEJ8Q8rGFB09SvrceeRs2AAGA14PDMfvkUdwatiw6gK1WNDHN5K79SM8L+4C7US0JYJjwVMY0qs7f5clRyHEb4wkYULcg8wWM9+lfMeC+AUcunIID0cPHm7xMBObTSTQ1dpCQmtN4f79pM2ZQ/7321CurvhOnYrvjOk41KxZdcGW5GM8EE3R9k/wyE8mX/syT01Ct5/O2O6tmCZLjkKI3yhJwoS4hxQYC/jyxJcsSlxESm4Kddzr8ErHVxjZeGRZc1VtsZC3dSvpc+ZS+OOP2Pn4EPDc7/CZOBE7b++qCzbnAnnbP8PuQCQu5hziLQ3Z4PZ7GvSczOPt68uSoxDiN0/+FhTiHnCl4AoxSTEsObqEnJIcWgW0Yla7WfQN6oudwXoRszYayV63nvR5cyk5cRKHOnWo8fqf8B41CoOLS5XFqs8fIGPLv/E+tQ4XbWGTpQOH602mR++hvNrYX5YchRCilCRhQtzFjmceJyohivWn1mOymMqaq7YJbFM2xlJQQNby5aR/sQDTxYs4NW1K7Q8+wHPwoKprM2ExUxK/luzvPiQg4wCO2oXFahC5bR7hgV5dGCxLjkIIcQOb/g2tlBoEfAjYAXO11u9ftz0IiAS8S8e8orXeYMuYhLjbaa3Zc3EPkfGR7LywExd7F0Y3Gc3U5lMJ8gwqG2fKzCRz0WIyFy3CnJ2Na3g4td58A7eePatutqk4l+xdX6D3/Afv4gsUWQL43PURfLvPZEzHprLkKIQQP8Nmf0MqpeyAT4H+wDlgn1JqjdY6odywPwFLtdb/UUo1BzYAwbaKSYi7mdFs5Kvkr4iMj+RY5jH8nP14tu2zjGs6Dm9n7/+Nu3CB9C8WkLV8ObqwEPc+ffCbORPXdm2rLFadmczlzR/hlRiLlyWfOEtT4mq+Qat+k3m8SaAsOQohxG2w5T9TOwIntNanAJRSscAIoHwSpoGrtz7yAi7YMB4h7ko5JTksO2ptrppamEpj78a83fVthjYciqOdY9m44uPHSZ87j+z16wHwGjYMv5mP4NS4cdUEqjXFybtJ3fQval/cjJ9WfKM6c7n5I/TrN5gnZMlRCCHuiC2TsDpASrnH54BO1415E9iklHoWcAP62TAeIe4q53LPsThxMSuOr6DQVEjnWp15q9tbdKt9bdf6ggMHSJ8zl7zvvkO5uOA7eRK+06fjULt21QRqNpK5fzmF2z6mdl48ntqVZc4jceryBAO7tpMlRyGE+IWq+2/PicACrfX/KaW6AAuVUi201pbyg5RSjwGPAQQFBVVwGCHuHYevHCYyPpLNZzdjwMDgBoOZFjaNZr7NysZorcn7/ntrm4n9+7Hz9sb/2WfwmTQJex+fKolTF2Rybst/cD80Hx/TFbIsNVkc8Dsa9X+U8SH1ZMlRCCF+JVsmYeeBeuUe1y19rrxHgEEAWuvdSilnwB9ILT9Iaz0bmA0QHh6ubRWwELZitpjZem4rUfFRHEg9gIeDB9PDpjOp2SRquv2vcao2Gsn56ivS58yl+Phx7GvXosYf/4j36FEYXKtmua/48jFSNvyTOmdWUY8i9hJGcpOX6TJoIpP93KskBiGE+C2wZRK2D2iilGqANfmaAEy6bsxZoC+wQCkVCjgDV2wYkxBVqtBUyJoTa1iYuJAzOWeo7Vablzu8zMgmI3FzcCsbZyksJGv5CtK/mI/pwkWcmjSm9t/ex3PIEJSDg+0D1Zr0+C1kbPk3jTJ3UE/b8b1jT0wdnySiVx86ypKjEEJUOpv9zaq1NimlngE2Ym0/MV9rHa+UehuI01qvAX4PzFFKPY+1SH+G1lpmusQ9L60wjdikWJYcXUJWcRYt/FrwQa8P6BfUD3vD//63M2dlkbF4MZmLFmPOzMSlXTtqvv467r16oQwGm8epTcWc3hqFw77PqVd8ArQHG3wmU6PvM/Rv0UyWHIUQwobUvZbzhIeH67i4uOoOQ4gKncw6SVRCFOtOrsNoMRJRL4LpYdNpF9jumoTGePEiGQsiyVy2DF1QgHtEBH6PzsS1ffsqibMoO5UTGz6i9rFF+OpMTlCXo8FTaT3kUeoG+lVJDEII8VuglNqvtQ6vaJusMQjxK2mt2XtpL5HxkWw/vx0nOycebPwgU5tPJdgr+JqxxSdPWttMrF0LWuM1bCi+jzyCc9OmVRLrlVMHufD1PwlJ/YoWlLDPvh0H27xP5/6jaexUBcueQgghykgSJsQvZLQY2Zi8kaj4KBIzEvF19uXpNk8zPmQ8Ps7XXsFYePAgaXPmkrdlC8rZGZ+JE/GbMR2HOnVsHqe2WDi2ew3mnZ/QvGAfHtqBvZ4D8Oj1LOHtO8uSoxBCVBNJwoS4Q7kluSw/tpzFiYu5XHCZhl4NebPLmwxrNAwnO6eycVpr8rdvJ332HAri4rDz8sL/qafwmTqlStpMFBXkceTrOfjHzy
fEfJYrePN9ncdoPOR39KxT79YHEEIIYVOShAlxmy7kXWBR4iJWHl9JvjGfjjU78ucuf6Z7ne4Y1P+K6LXJRM5XX5M+dy7FR49iX7MmNV59Be8xYzC4uf3MGSrH5QtnOLn+34SeX044OZwwNGB3q3dpPegherna/vxCCCFujyRhQtxCfFo8kfGRbDqzCYBBDQYxrfk0mvs1v2acpaiIrJUryZg3H+P58zg2akSt997Da+gQlKNjRYeuNFpr4n/cSd7Wj2ibvYXOmDns1oXzXZ8mrOsQGlfBlZZCCCHujCRhQlTAoi1sO7eNBfEL2H95P+4O7kxtPpXJoZOvaa4KYM7OJjMmhoyohZgzMnBp3Zoar72Ke+/eNm8zUVRiJO6bWDx+nE1r02EKcOKnGg9SZ9DztGnYwqbnFkII8etIEiZEOUWmItacXMPChIUk5yRTy60WL4a/yOgmo3F3vLZbvPHyZTIWRJK1ZAmWggLcevXEf+ZMXMLDbV7sfiktjZ/Wf06T04vpzgWuKH9+DHmekCHPEO7lb9NzCyGEqByShAkBXMq/xNKjS1lxfAUZRRk092vO33r8jf7B/XEwXNu6ofjUKdLnzSN7zVqwWPAcPBi/mY/g3KzZTY5eObTWHEpIIHXzx3TMWEN/lc9pp2Yc7fASTSMmE2Bv2yVPIYQQlUuSMPGbpbVm36V9xB6N5duz32LRFnrV68W05tMIr3HjbFbh4cOkz5lD7uYtKEdHfMaOxffhh3CsW9emcRYZzez4fiP2ez+nW/EOWioLx3x7U9xvFg2a9wRpMSGEEPckScLEb06BsYC1J9cSkxTDyeyTeDl5MT1sOuNCxlHH/dq+XVpr8nfuIn3OHAp++AGDpyd+TzyO75Qp2PvZtrP8xcw89n4VRdCxSPqRRD6unGgwmeAhzxMa2NCm5xZCCGF7koSJ34zT2adZcnQJq0+sJs+YR6hvKH/p9hcGBQ/C2d75mrHaZCJ30ybS5s6lOCER+8BAAl96Ce9x47Bzt12bB601B46f5fQ3n9MpdRkj1BWu2NfidJs/EdzvMUKdvWx2biGEEFVLkjBxXzNbzGw7t43Yo7HsurALe4M9g4IHMaHZBFr5t7phydFSXEz2qlWkz5uPMSUFxwYNqPXuO3gOH47Bhm0mioxmtuzeR8nOz+hXtIn2qpAUj9ak9XyPgPBRBBjsbHZuIYQQ1UOSMHFfyirKYuWJlSxJWsKF/AsEugbybNtnGdVkFP4uN149aExNJWvpMjJjYzGnpeHcqhWBL/0Bj759bdpm4mJWAd99s5bA+PkM0j+glYFztQfiMOgF6tXvYLPzCiGEqH6ShIn7Snx6PLFJsXx1+iuKzcV0qNmBFzu8SO96vbE3XPtx11pTeOAAmYsXk7PpGzCZcOvZA79HZuLasYPN2kxordl/OpXDm6JofyGGSYaT5Bs8uBj6OHUGPEuwl20L/YUQQtwdJAkT97wScwmbzmwiJimGw1cO42LvwoONH2R8yHia+DS5YbyloIDsdevIXBxN8dGjGDw98Z0yBZ+JE3CsX99mcRYZzXy9L5HM7XMYWLCGcJVBumsQGZ3fx7frNNwc5ZZCQgjxWyJJmLhnXcq/xLJjy1h+bDkZRRkEewbzSsdXeKDRA3g4etwwvuTMGTJjYslauRJLTg5OISHUfPstvIYNw+DqarM4L2YXsu677Xgemsdwy3e4qmIu+XeiqO8s/JoNArmlkBBC/CZJEibuKVpr4i7HEZMUc01vr4khE+lcu/M1N9IG0BYL+du3k7F4Mfnbd4CdHZ4D+uMzeTIu7drZdMkxLjmDnVu+pMXZRTyifsSs7Mls9AAu/WdRs1Yrm5xXCCHEvUOSMHFPKDAWsO7UOmKSYjiRdQIvJy+mhU1jXNNx1PW4sYbKnJ1N1spVZMbEYDx7FrsAf/yfegrvceNwqBFosziLjGbWH0jmzLaFDMxdySzDGfIdfcht9zxePZ4g0KOGzc4thBDi3iJJmLirVdTb6+2ubzO4weAbensBFCUlkbl4Mdlr16GLinBp357AWc/h0a8fyoYtJi5mF7Ji+0EM+79gjGUjo1UWWZ6NKe75IW5tJ4DDjbEKIYT4bZMkTNx1zBYz289vJyYppqy318DggUxsNrHC3l66pIScb74hMzqGwv37Uc7OeA0fjs/kSTa9n6PWmrgzmWz87juanFrITMMOnJWRzDq90H1m4d2ot9xSSAghxE1JEibuGllFWaw6sYolR5dwPu88ga6BPNPmGUY3HV1xb6/LqWQtXUrm0iWYr6ThUK8egS+/jPeokdh52a6zfJHRzNqD5zn8/Ur6Za/gT3aHMdo7URw2AXo9i09AiM3OLYQQ4v4hSZiodgnpCcQkxVzT2+v34b8nol4EDgaHa8ZqrSncv5+MxYvJ/WYzmM249eyB7+TJuHXvbtvGqtmFxO48St6+aMab1zHWcJ4C1wBKOr+GY8eZOLjZ9l6SQggh7i+ShIlqYTQby3p7HbpyCBd7F0Y0GsGEZhPuqt5eV5ccV247QJ3ji5hm2IyfyiXPLwzd63VcW4wGe9vVmgkhhLh/SRImqtTV3l4rjq0gvSid+p71b93bKzqGrFWrrL29mjWj5l/etvb2cnGxWZxFRjNrD11g27Zvichcxtt2u7C3s1DYcAD0/B3u9btJvZcQQohfRZIwYXMV9vaq24uJzW6jt9e27WBvj+eAAfhMnmTT3l5gXXJctPs05/euZrxxDR/bJWB0dIG2D6O6PImrXyObnVsIIcRviyRhwmYq7O3VfBrjQm7S2ysr63+9vVJSsA8IwP+ZZ/AeNxaHQNv19jJbNNuOXWHFnqP4nVjBDMNXNDBcotijFrrr2zi0nw4u3jY7vxBCiN8mScJEpUvOTmbJ0SV8eeLL2+vtlZhIxuLF5Kxbb+3tFd6ewBeet/b2cnCo4AyV43xWIUv3pbB5308MLFjDO/ab8bbPo7hGO+jxV5xCR4Cd/C8ihBDCNuQbRlQKs8XMjvM7iEmKYeeFndgb7BlQfwATm02kdUDrm/f2WhxN4YEDVdbby2i28G1SKrF7z3Lm+GFm2q1nld12HOxN6JAh0O05nII62ez8QgghxFWShIlf5Rf19lqyhMxlS629vYKCCHzlZbxH2ra315n0fJbsS2HZ/nPUy/uJ3zlvoJfjPrBzRLWZDF2eRfk3ttn5hRBCiOtJEiZ+kcT0RGKSYthwegPF5mLCa4TzQvsX6B3U+67p7VVsMrMp/jKx+86y68QVBtjtJ8Z9E42d4tFO3qiOL0LHx8DddvVmQgghxM1IEiZum9Fs5Jsz3xCTFMPBKwdxsXfhgUYPMKHZBJr6NL1hvKWggOy168iMLtfba+pUa2+voCCbxXkiNY/YvWdZ+eN58vPzeNj9Bz7xWY9P4VlwCYI+f0e1nQKObjaLQQghhLgVScLELV3Ov8yyY8tYfmx5WW+vlzu8zAONH8DT0fOG8WW9vVauxJKbi1OzZtR65y94Dh1qs95eRUYz6w9fJHbfWfYlZ+JnyOeNmrsZ7LAax6J0CGgDQ98EKbYXQghxl5BvI
1EhrTX7L+8nJimGLWe3YNEWetbtycRmE+lSu8uNvb3MZvK2bydzcTT528v19poyGZe2bW3W2yvxYk7ZrFdukYnOvnmsbfwdLS6vRmUUQOP+0O13ENxDmqsKIYS4q0gSJq5xfW8vT0fPW/f2WrGSzNjY//X2evYZvMfarrdXfrGJtYcuELMvhUMpWTjaG3i0cQ4PsQa/M1+hLhig5Vjo+izUaG6TGIQQQohfS5IwAcCZnDPEJsWy+sRqco25NPNtxttd32ZQg0G42N+4hFiUkEBGdDQ5a9ehi4tt3ttLa83hc9nE7jvLmoMXyC8x0zTQjf92zqRPRiwOydvB0QO6PAWdngSvOpUegxBCCFGZJAn7DSvr7XU0hp3nrb29+tfvz6Rmk27e22vTN2RGl/b2cnHBa8QIa2+vkBCbxJhdaGT1wfPE7E0h8WIOLg52jGjhz+P+Bwk+Og91MAE8akH/t6H9DHC2XZsLIYQQojJJEvYblF2czarjq4g9Gmvt7eUSyNNtnmZM0zE/39tr6VLMaWk41Ldtby+tNXFnMonZe5YNP12kyGghrLYn7w8L5kHzJpzjZkPiBQhsDg/+B1qMAXvHSo9DCCGEsCVJwn5DEtMTiT0ay/pT68t6ez3f/nn6BPWpuLdXXBwZi6PJ3Wzt7eXesyc+Uybj1q2bTXp7ZeSXsPLAOWL3pXAiNQ93J3tGtavL1OaOhJ5ZBNsXQHGOtcj+gY+gcT8pthdCCHHPkiTsPveLe3stXkzxsWMYvLzwnTbN2turXr1Kj89i0ew5lU7MvhQ2HrlEidlC2yBv/j66FcNqZeG67zNYsgy0GZo/aC22r9Ou0uMQQgghqpokYfep1IJUlh1bxrKjy0gvSifII4iXOrzEiMYjKu7tlZxMZkwMWStXWXt7hYbatLdXam4Ry/efY8m+FM6kF+Dl4sCkTkFM7FCPkKKDsPN5WP8NOLhC+MPWgnuf4EqPQwghhKgukoTdR6729oo9GsuWM1swazM96/ZkQrMJdK3dteLeXtu2kRkd87/eXgMH4jN5kk16e5ktmm3HrxC79yxbElMxWTSdGvjyfL+mDGruj/Px9bD2abjwI7gFQO8/QYdHwNW3UuMQQggh7gY2TcKUUoOADwE7YK7W+v3rtv8L6F360BUI1Fp72zKm+1GBsYD1p9cTkxTD8czjeDp6MqX5FMaFjKOex41LiGW9vWJiMJ47Z/PeXheyClkal8KyuHOczyrEz82RR7o3YHyHejT0UvDjYvjPJ5B1Bvwaw7B/Q+sJ4GCb7vpCCCHE3cBmSZhSyg74FOgPnAP2KaXWaK0Tro7RWj9fbvyzQFtbxXM/OptzltijsXx5/Muy3l5vdX2LwQ0G31ZvL9fwcAJf/D0efftWem8vo9nCt0mpxO49y/fHrmDR0KOJP38cGkq/0BrWWwnt/RD2zYHCTKjXCQb+FUKGgI1u6C2EEELcTWw5E9YROKG1PgWglIoFRgAJNxk/EXjDhvHcFyzawo7zO4hOirb29lL29A++jd5eixdT+OOP1t5eDz6Iz6RJOIfcWJj/a51NLyB231mW7T/Hldxiang68VREY8Z3qEc9X1dIPwlfvQCHYsBUbE26uv0OgjpXeixCCCHE3cyWSVgdIKXc43NAp4oGKqXqAw2Ab20Yzz0tuzibL098SWxSLOfyzhHoEshTbZ5iTJMxBLgG3DDeePlyaW+vZWW9vWq8+gpeI0di53ljYf6vUWwy803CZWL3prDjRBoGBb1DApnQMYjeIQHY2xkgZS9s+hCS1oOdo3W5seuz4N+kUmMRQggh7hV3S2H+BGC51tpc0Ual1GPAYwBBQUFVGVe1S8pIIiYphg2nNlBkLqJ9jfbMaj/r1r29vvkGLBbce/XCZ/Ikm/T2OpGax5J9Z1lx4DwZ+SXU8Xbhhf5NGRtel1peLmCxwLGvYOdHkLIHnL2hx++h0+Pgbpv7SgohhBD3ClsmYeeB8lXhdUufq8gE4OmbHUhrPRuYDRAeHq4rK8C7ldFsZPPZzcQkxfBj6o+42LswrNEwJoRMIMT3xtsDWfLz/9fb6/hxa2+v6dNt0turyGhmw08Xid2bwt7kDOwNiv7NazChYxDdG/tjZ1BgLIL9C2DXJ5B+HLyCYNDfoO0UcHKv1HiEEEKIe5Utk7B9QBOlVAOsydcEYNL1g5RSzQAfYLcNY7knXO3ttfzYctIK0+6q3l6JF3OI3XuWVT+eJ6fIRLCfK68MbsbodnUJ8HCyDirIgLh58MNsyE+FWq1h9Dxrk1W7u2XSVQghhLg72OybUWttUko9A2zE2qJivtY6Xin1NhCntV5TOnQCEKu1vu9nuCqiteZA6gFikmLKenv1qNuDic0m/nxvr8XR5O/YAQ4OeA4YgM/kybi0bVOpvb3yi02sPXSBmH0pHErJwtHOwOCWNZnQIYjODX3/d66ss7D7MzgQBcZ86+2Euv4OGvSU2woJIYQQN6HutdwnPDxcx8XFVXcYv1qBsYANpzcQkxTDscxjeDp6MqrJqJv29jJlZpK9ciWZMbHW3l6BgXhPGI/P2LHYB9xYmP9Laa05fC6b2H1nWXPwAvklZpoEujOhYxCj2tbBx63cjbIvHrLWe8WvsiZbLcdai+1rhFVaPEIIIcS9TCm1X2sdXtE2WSOqYmW9vU58SW5JLiE+IT/b26swPp7M6Ghy1q239vbq0MEmvb1yioys/vE80XtTSLyYg7ODgWGtajOxYz3aBfn8b9ZLazi5xZp8nf4eHD2g85PWP151Ky0eIYQQ4n4nSVgVuNrbKyYphh3nd1h7e9Xvz8TQibQJuHEJUZeUkLNxk7W318GDNuvtpbVm/5lMYvamsP6nCxQZLYTV9uQvD7ZgRJvaeDqXS/LMRjiyAnZ9DJePgEct6PcWhD8Ezl6VFpMQQgjxWyFJmA1d39srwCXgjnp7Odavb5PeXpn5Jaw4YL159vHUPNyd7BnVri4TOwTRsu51CVVRDhyIhD3/gZzzEBAKIz6zLj3aO1Z8AiGEEELckiRhNnA04ygxSTGsP7WeInMR7QLb8Vz75+gb1LfC3l4F+/aRuTia3M2by/X2moxbt66V1tvLYtHsOZVOzL4UNh65RInZQtsgb/4+uhVDW9XCzem6j0LORfjhPxC3AIqzIbiH9Z6OTfpLsb0QQghRCSQJqyRGs5EtZ7cQkxTDgdQDd01vr9TcIpbvt856nUkvwNPZnkmdgpjQsR7NalYwu5aaaF1yPLwUtBmaj7Be6VinXaXFJIQQQghJwn611IJUlh9bzrJjy0grTKOeRz3+EP4HRjQegZfTjbVSxadPkxkTQ/bKVVjy8nBqHkqtd9/Bc8iQSuvtZbZoth+/Qszes2xJTMVk0XRs4Musfk0Y3KIWzg521+6gNSTvgF0fwfFNYO9irfXq/BT4NqiUmIQQQghxLUnCfgGtNT+m/khMUgybz2y+vd5e328jc/Fi8nfutPb2GjgQn8mTcGlTeb29LmYXsnTfOZbGpXA+qxBfN0ce7t6A8R3q0Siggk71FjMkrrFe6XjhALj6Q+8/QoeZ
4OpbKTEJIYQQomKShN2BQlMhG05Ze3sdzTyKh6MHk0MnMz5kPPU8f6a3V3QMxvPnsQ8MxP93z1Zqby+T2cK3SanE7kth69FULBp6NPHntSGh9G9eA0f7CmrKSgrg4GLY/QlkJoNvQxj2L2g9ERwqt9O+EEIIISomSdhtSMlJIfZoLKtOrCrr7fVmlzcZ0nDIzXt7LY4mZ3253l5/+AMefftUWm+vlIwCYvedZVncOVJziwn0cOKpiMaM71CPer6uFe+UnwZ7Z8PeOVCYAXU7QP+/QLOhYLCreB8hhBBC2IQkYTdh0RZ2nt9Z1tvLTtlVe2+vEpOFTQmXiN2bwo4TaRgU9A4JZELHIHqHBGBvd5MrKdNPWme9DkaDqQhChliL7YM6y5WOQgghRDWRJOw6OSU5rDq+iiVHl5CSm0KASwBPtn6SMU1v0tvr0iUylywha9ny//X2eu1VvB58sNJ6e528kkfs3rOsOHCejPwS6ni78Hy/pozrUJdaXj+zfJiyD3Z9CInrwM4BWk+ALs9CQOU1fBVCCCHELyNJ2HXO557nH3H/oF1gO37X7nc37+21dx+Z0eV6e0VE4DNpUqX19ioymtnw00Vi96Ww93QG9gZFv9AaTOhYjx5NArAz3GQGy2KBY19br3Q8u9vazb7HC9DxcfCo8avjEkIIIUTlkCTsOqF+oax9cC3BXsE3bLP29lpb2tvrBHZeXvjOmI7PxIk41q2c+yYmXcohdm8KKw+cI6fIRLCfKy8PasaY9nUJ8HC6+Y7GIji8xLrsmHYMvIJg0PvQdio4VXBlpBBCCCGqlSRhFbg+ASs+Vdrba1X53l7v4jl0CAZn5199vvxiE+sOXyBmbwoHU7JwtDMwqEVNJnSsR+cGfhhuNusFUJgJcfNhz+eQnwo1W8HoedD8QbCTX68QQghxt5Jv6ZuwdW8vrTU/nc8mZm8Kaw6eJ7/ETJNAd14f1pxRbevg43aL+zJmnbXez3F/JBjzoVFf6PY7aNBLiu2FEEKIe4AkYdcxZ2eTtWwZmTGx1t5eNWoQ8Nzv8B47Fnt//199/JwiI6t/PE/M3hQSLubg7GBgaMvaTOpUj3ZBPrdO7i4ettZ7HVlpTbZajIauz0LNlr86NiGEEEJUHUnCrmO8eJHUf/xfpfb20lqz/0wmMXtTWP/TBYqMFprX8uQvI8J4oE0dvFxucXyt4eS31uTr1FZwdIfOT1r/eFVOLZoQQgghqpYkYddxbtaMRps24hgU9KuPlZlfwsofzxO79yzHU/Nwc7RjZNu6TOxYj5Z1vG4962U2Wme8dn0Ml38C95rQ701o/xC4eP/q+IQQQghRfSQJq8CvScC01uw+lU7s3hS+PnKJErOFNvW8+dvolgxrVRs3p9t4y4tzrbVee/4DOecgoBmM+BRajgX7n7lCUgghhBD3DEnCKsmV3GKW7z/Hkn1nSU4vwNPZnkmdgpjQsR7Nat5m09aci/DD5xD3BRRnQ/3uMOyf0Lg/VELvMSGEEELcPSQJ+xXMFs3241eI3ZvC5sTLmCyajsG+PNevCYNb1MLZ4Tbvx5iaZF1yPLwEtBlCH7Be6VinvW1fgBBCCCGqjSRhv8DF7EKW7jvH0rgUzmcV4uvmyEPdghnfIYjGgbfZGFVrOLPLWmx/7Guwd4H2M6DLU+Db0KbxCyGEEKL6SRJ2m0xmC98mpRK7L4WtR1OxaOje2J9XhzSjf/MaONnf5qyXxQyJa63J1/n94OoHEa9Ch0fBzc+2L0IIIYQQdw1Jwm4hJaOAJftSWBqXQmpuMYEeTjwZ0Yjx4UEE+bne/oFKCuDgYtj9KWSeBp8GMPT/oPUkcLyD4wghhBDiviBJWAVKTBa+SbhM7L6zbD+ehkFBREggEzrUo0+zQOzt7qBIPj8N9s6BfXOgIB3qhEP/t6DZMDDc5uyZEEIIIe47koRd51BKFg8v2Ed6fgl1vF14vl9TxobXpba3y50dKP2kddbr4GIwFUHTwdZi+6AuclshIYQQQkgSdr0mNdzp1tifUe3q0KNJAHY/d/PsipyLg50fWuu+7Byg1XjrbYUCQmwTsBBCCCHuSZKEXcfV0Z6PJra9s50sFji+EXZ+BGd3gbMXdH8eOj0OHjVtE6gQQggh7mmShP0apmJrb69dn0DaUfCqBwPfg3ZTwcmjuqMTQgghxF1MkrBfojAL4uZbu9vnXYYaLWHUHAgbaV2CFEIIIYS4BUnC7kRWivV+jgcioSQPGvaGkZ9b/yvF9kIIIYS4A5KE3Y5LP1nrvY6ssD5uMdpabF+rVfXGJYQQQoh7liRhN6M1nPrOmnyd+g4c3aHTE9D5SfCuV93RCSGEEOIeJ0nY9cxGiF9lva3QpZ/AvQb0fQPCHwYX7+qOTgghhBD3CUnCrnd+P6x8FPxD4IFPoNU4sHeq7qiEEEIIcZ+RJOx69TrB9HVQvxsY7uD2REIIIYQQd0CSsOspBQ16VHcUQgghhLjPyVSPEEIIIUQ1kCRMCCGEEKIaSBImhBBCCFENJAkTQgghhKgGNk3ClFKDlFJHlVInlFKv3GTMOKVUglIqXikVbct4hBBCCCHuFja7OlIpZQd8CvQHzgH7lFJrtNYJ5cY0AV4FummtM5VSgbaKRwghhBDibmLLmbCOwAmt9SmtdQkQC4y4bsyjwKda60wArXWqDeMRQgghhLhr2DIJqwOklHt8rvS58poCTZVSO5VSe5RSg2wYjxBCCCHEXaO6m7XaA02ACKAusE0p1VJrnVV+kFLqMeAxgKCgoCoOUQghhBCi8tlyJuw8UK/c47qlz5V3DlijtTZqrU8Dx7AmZdfQWs/WWodrrcMDAgJsFrAQQgghRFVRWmvbHFgpe6xJVV+sydc+YJLWOr7cmEHARK319P9v735j5KrKOI5/f6mt1ra2iWhtBF0TGowQXYqWvzYVYhWoJYYm7QuVIibWiNR/EPBFLbxRY4IoMaC0IiiElUVIQf60xpqUCNX+0wLlBYGiJZgWkNJq06bL44t7FqfjzsxVmDk79/4+yWRn7j1z7/P0bM6e3ntmHknHANuAwYh4sc1x9wLPdiXoox0DvNCD84xHzr2+6px/nXOHeufv3OurF/m/NyLGvILUtduREXFE0qXAQ8AE4GcR8bika4DNEbE27Vsg6QlgBLi83QQsHbcnl8IkbY6ID/fiXOONc69n7lDv/OucO9Q7f+dez9whf/5dXRMWEfcD9zdtW9nwPICvp4eZmZlZbfgb883MzMwy8CSstZ/mDiAj515fdc6/zrlDvfN37vWVNf+uLcw3MzMzs9Z8JczMzMwsg9pOwiQdJ2lDQ/HwFWO0kaQfpQLkf5E0J0es3VAy//mS9knanh4rxzpWv5H0Fkl/lPTnlPvVY7R5s6Sh1PebJA1kCLUrSua/TNLehr7/Qo5Yu0XSBEnbJN03xr7K9j10zL3q/b5L0o6U2+Yx9ld5zO+UeyXH+1GSZkgalvSkpJ2STm/an6Xvc39jfk5HgG9ExFZJ04AtktY3FhgHzqX48tjZwKnADelnFZTJH2BjRCzMEF83HQLOjogDkiY
CD0t6ICIebWhzCfCPiDhe0lLge8CSHMF2QZn8AYYi4tIM8fXCCmAn8LYx9lW576F97lDtfgf4WES0+l6oKo/50D53qOZ4P+qHwIMRsVjSJOCtTfuz9H1tr4RFxPMRsTU9308xKDXXtrwAuDUKjwIzJM3qcahdUTL/Skr9eSC9nJgezYsjLwBuSc+HgXMkqUchdlXJ/CtL0rHA+cDqFk0q2/clcq+7yo75dSZpOjAPWAMQEYebyyOSqe9rOwlrlG43nAxsatpVpgh532uTP8Dp6bbVA5JO7G1k3ZNuyWwH9gDrI6Jl30fEEWAf8PaeBtlFJfIHuDBdlh+WdNwY+/vVdcAVwKst9le576+jfe5Q3X6H4j8b6yRtUVGTuFmVx/xOuUNFx3vgfcBe4OZ0K361pClNbbL0fe0nYZKmAncBX42IV3LH02sd8t9KUW7hQ8D1wD09Dq9rImIkIgYpaprOlXRS5pB6qkT+9wIDEfFBYD3/uTLU1yQtBPZExJbcsfRaydwr2e8NzoqIORS3nr4saV7ugHqoU+6VHe8pll7NAW6IiJOBfwJX5g2pUOtJWFoPcxdwW0T8eowmZYqQ961O+UfEK6O3rVL1g4kqanxWRrokvQH4ZNOu1/peRR3U6UDbklr9qFX+EfFiRBxKL1cDp/Q4tG45E1gkaRdwB3C2pF82talq33fMvcL9DkBEPJd+7gHuBuY2NansmN8p94qP97uB3Q1X/IcpJmWNsvR9bSdhaY3HGmBnRFzbotla4HPpUxOnAfsi4vmeBdlFZfKX9K7RtTCS5lL8vvT9HyNJ75A0Iz2fDHwceLKp2VrgovR8MfC7qMiX6pXJv2ktxCKKNYN9LyKuiohjI2IAWErRr59palbJvi+Te1X7HUDSlPQhJNKtqAXAY03NKjnml8m9quM9QET8HfibpBPSpnOA5g+hZen7On868kzgs8COtDYG4FvAewAi4kaKupfnAU8B/wIu7n2YXVMm/8XAlyQdAQ4CS6vwxwiYBdwiaQLFQPOriLhPRxeXXwP8QtJTwEsUf7Sqokz+l0laRPEp2peAZdmi7YEa9f1/qVG/zwTuTvOMNwG3R8SDkpZD5cf8MrlXdbwf9RXgtvTJyKeBi8dD3/sb883MzMwyqO3tSDMzM7OcPAkzMzMzy8CTMDMzM7MMPAkzMzMzy8CTMDMzM7MMPAkzs3FL0oik7ZIek3SnpOaiu41t50s6o+H1zyUtLnGOA53a/K8kDUo6r+H1KknffKPPY2b9zZMwMxvPDkbEYEScBBwGlrdpOx84o83+Xhqk+M4hM7OWPAkzs36xEThe0qckbUqFeH8raWYqQr8c+Fq6cvbR9J55kv4g6emSV8Uul/SnVMD66rRtQNJOSTdJelzSulRpAEkfSW23S/p+umI3CbgGWJK2L0mH/4Ck36dYLnuj/3HMrP94EmZm416q4XgusAN4GDgtFeK9A7giInYBNwI/SFfONqa3zgLOAhYC3+1wjgXAbIqaeoPAKQ1FjmcDP46IE4GXgQvT9puBL6Zi6CMAEXEYWAkMpViGUtv3A59Ix/92qt1qZjVW57JFZjb+TW4oq7WRoqTQCcBQqnM4CXimzfvviYhXgSckzexwrgXpsS29nkox+for8ExEjMaxBRhI9TenRcQjafvtFJO9Vn6TimMfkrSHopTM7g4xmVmFeRJmZuPZwXSV6TWSrgeujYi1kuYDq9q8/1DjWzucS8B3IuInTecbaDrOCDC5w7E6xTKCx1+z2vPtSDPrN9OB59Lzixq27wemvY7jPgR8XtJUAEnvlvTOVo0j4mVgv6RT06bGQt+vNxYzqwFPwsys36wC7pS0BXihYfu9wKebFuaXFhHrKG4pPiJpBzBM54nUJcBN6ZbpFGBf2r6BYiF+48J8M7OjKCJyx2Bm1pckTY2IA+n5lcCsiFiROSwz6xNek2Bm9v87X9JVFGPps8CyvOGYWT/xlTAzMzOzDLwmzMzMzCwDT8LMzMzMMvAkzMzMzCwDT8LMzMzMMvAkzMzMzCwDT8LMzMzMMvg3QJhXpIQLGN8AAAAASUVORK5CYII=\n", + "text/plain": [ + "

" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "seed_idx = list(range(2,max_depth +1))\n", + "\n", + "plt.figure(figsize=(10,5))\n", + "\n", + "for i in range(len(data)):\n", + " plt.plot(seed_idx, time_algo_cu[i], label = names[i])\n", + "\n", + "\n", + "plt.title(f'Runtime vs. Path Length ({num_seeds} Seeds)')\n", + "plt.xlabel('Path length')\n", + "plt.ylabel('Runtime')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12979" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "del time_algo_cu\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 2: Runtime versus number of seeds\n", + "The number of seeds will be increased over a range in increments of 10. \n", + "The runtime will be the sum of runtime per increment. Increaing number of seeds by 1 would make for very long execution times " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading ./data/preferentialAttachment.mtx...\n", + "\t100,000 nodes, 499,985 edges\n", + "\t.................................................................................................... \n", + "Reading ./data/dblp-2010.mtx...\n", + "\t326,183 nodes, 807,700 edges\n", + "\t.................................................................................................... \n", + "Reading ./data/coPapersCiteseer.mtx...\n", + "\t434,102 nodes, 16,036,720 edges\n", + "\t.................................................................................................... \n", + "Reading ./data/as-Skitter.mtx...\n", + "\t1,696,415 nodes, 11,095,298 edges\n", + "\t.................................................................................................... 
\n" + ] + } + ], + "source": [ + "# some parameters\n", + "rw_depth = 4\n", + "max_seeds = 1000\n", + "\n", + "# arrays to capture performance gains\n", + "names = []\n", + "\n", + "# Two dimension data\n", + "time_algo_cu = [] # will be two dimensional\n", + "\n", + "i = 0\n", + "for k,v in data.items():\n", + " time_algo_cu.append([])\n", + " \n", + " # Saved the file Name\n", + " names.append(k)\n", + "\n", + " # read data\n", + " G = read_and_create(v)\n", + " \n", + " num_nodes = G.number_of_nodes()\n", + " nodes = G.nodes().to_array().tolist()\n", + " \n", + " print('\\t', end='')\n", + " for j in range (10, max_seeds +1, 10) :\n", + " print('.', end='')\n", + " seeds = random.sample(nodes, j+1)\n", + " t = run_rw(G, seeds, rw_depth)\n", + " time_algo_cu[i].append(t)\n", + "\n", + " # update i\n", + " i = i + 1\n", + " print(\" \")\n", + " \n", + " del G\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmcAAAFNCAYAAABFbcjcAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAByJUlEQVR4nO3dd3gU1dvG8e/ZFFJJp/cOoQQIHaRI71WQInbAiqgodn3t+rNhQUTEBgoICkjvvQQIvYQSIKETCAmpu/u8f2SJCR1N2BCez3VxsTtzzsw5O9lwM3PmjBERlFJKKaVU3mBxdgOUUkoppdQ/NJwppZRSSuUhGs6UUkoppfIQDWdKKaWUUnmIhjOllFJKqTxEw5lSSimlVB6i4UwpddOMMaWMMYnGGBdntyUvMsaUMcaIMcbVSftvYoyJchyj7s5og6Md9xtjVjpr/0rdrjScKZVPGGOijTHJjn+QjxtjJhhjfHJw260vvheRwyLiIyK2nNh+bnOEBDHGjLxkeYwxpoVzWpWr3gK+dByjPy9daYxpaoxZbYyJN8bEGWNWGWPq3fpmKqWuRMOZUvlLFxHxAcKA2sAo5zYnT4kDRhpjfJ3dkJvxL8++lQZ2XGV7BYFZwGggECgOvAmk/ts2KqVyloYzpfIhETkOzCMjpGGMaWGMiclaJuvZMGPMG8aYycaYn4wxCcaYHcaYcMe6n4FSwEzHWbmRl162M8YsNca87Tgbk2iMmWmMCTLG/GqMOW+M2WCMKZNl31WMMQscZ232GGPuuVI/jDF9jTERlyx7xhgzw/G6ozFmp6PNscaY567xsewC1gAjrrKvCcaYt7O8z/aZOT6v540xW40xF4wx3xtjChtj5jj2v9AYE3DJZh80xhw1xhzL2jZjjMUY86IxZr8x5ozjsw90rLv42T5kjDkMLL5Kex8xxuxzfIYzjDHFHMv3A+X453gVuKRqJQARmSQiNhFJFpH5IrI1y7YfNMbsMsacNcbMM8aUzrLuqsfOccxnOI75eqB8lnXGGPOpMeakY/02Y0z1K/VNqTudhjOl8iFjTAmgA7DvJqp1BX4D/IEZwJcAIjIIOIzjrJyIfHiV+v2AQWSciSlPRhD6gYyzM7uA1x1t8wYWABOBQo56Xxtjql1hmzOBysaYilmW9XfUBfgeGCIivkB1rhJksngVGH4xCP0LvYA2ZAScLsAc4CUghIzfp09dUr4lUBFoC7yQ5dLwk0B3oDlQDDgLfHVJ3eZAVaDdpY0wxrQC3gPuAYoCh8g4dohIebIfr0vPiO0FbMaYH40xHS4NlMaYbo4+9XT0awUwybHuesfuKyDF0aYHHX8uagvc5fjs/BxtP3Np35RSGs6Uym/+NMYkAEeAkzgC0Q1aKSKzHePIfgZq3eS+fxCR/SIST0Zo2S8iC0XECkwh4zIrQGcgWkR+EBGriGwG/gD6XLpBEUkC/gLuBXCEtCpkhEeAdKCaMaagiJwVkU3XaqCIRJIRLl64yb5dNFpETohILBmhZZ2IbBaRFGB6lj5e9KaIXBCRbWQE1Xsdy4cCL4tIjCM8vQH0vuQS5huOuslXaMcAYLyIbHLUHwU0ynp28mpE5DzQFBDgO+CU42xX4Sxte09EdjmO3btAmOPs2VWPncm4OaQX8Jqj3duBH7PsOh3wJeP4Gcf2j12vvUrdiTScKZW/dHecRWpBxj+CwTdR93iW10mAx02OdzqR5XXyFd5fvDmhNNDAGHPu4h8ywkaRq2x3Iv+Emv7An47QBhlhoCNwyBizzBjT6Aba+RowLEsYuRk32seLjmR5fYiMs2SQ8RlMz9L/XYANKHyVupcq5tgeACKSSMZZqOLX7wI4gtH9IlKCjDOOxYDPsrTt8yxtiwOMY9vXOnYhgOsV+nxxn4vJOBv7FXDSGDPWZIx/U0pdQsOZUvmQiCwDJgAfOxZdALwurnec5Qi5mU3mWOMy/vFeJiL+Wf74iMiwq5RfAIQYY8LICGkXL2kiIhtEpBsZl9j+BCZfb+cishuYBrx8yapsnxFXD4s3o2SW16WAo47XR4AOl3wGHo4zcplNvcZ2j5IRlIDMy41BQOxVa1yF4/OYQEZIu9i2IZe0zVNEVnPtY3cKsF6hz1n39YWI1AWqkXF58/mbba9SdwINZ0rlX58BbYwxtcgYZ+RhjOlkjHEDXgEuHSh+LSfIGGSeE2YBlYwxg4wxbo4/9YwxVa9UWETSybgs+hEZ49cWABhj3I0xA4wxfo4y5wH7DbbhTeABMsbXXRQJdDTGBBpjigDDb75rl3nVGONljAl17O93x/IxwDsXB9obY0IcY71u1CTgAWNMmGPA/7tkXGKNvl5Fx4D+Zx3jEjHGlCQj9K7N0rZRjjZjjPEzxly85HzVY+e4HD4NeMPR52rA4Cz7rWeMaeD4+btAxti0Gz1eSt1RNJwplU+JyCngJzLGAMUDjwHjyDi7cgGIuUb1S70HvOK4lHWtOyJvpF0JZAwO70fGGaDjwAdcOyxOBFoDUxzjoC4aBEQbY86TMVZqwA224SAZ4+q8syz+GdgCRAPz+SdI/
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "[... base64-encoded PNG image data elided: line plot \"Runtime vs. Number of Seeds\", one curve per dataset ...]",
+      "text/plain": [
+       "<Figure size 720x360 with 1 Axes>"
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "seed_idx = list(range (10, max_seeds +1, 10))\n", + "\n", + "plt.figure(figsize=(10,5))\n", + "\n", + "for i in range(len(data)):\n", + " plt.plot(seed_idx, time_algo_cu[i], label = names[i])\n", + "\n", + "\n", + "plt.title('Runtime vs. Number of Seeds')\n", + "plt.xlabel('Number of Seeds')\n", + "plt.ylabel('Runtime')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4094" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "del time_algo_cu\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 3: Multi-seed versus Sequential\n", + "This test uses a single files since sequential execution is slow" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading ./data/coPapersCiteseer.mtx...\n", + "\t434,102 nodes, 16,036,720 edges\n" + ] + } + ], + "source": [ + "G = read_and_create('./data/coPapersCiteseer.mtx')\n", + "nodes = G.nodes().to_array().tolist()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "rw_depth = 4\n", + "max_seeds = 100\n", + "num_nodes = G.number_of_nodes()\n", + "runtime_seq = [0] * max_seeds" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# sequential = so also get a single random seed\n", + "for i in range (max_seeds) :\n", + " for j in range(i):\n", + " seeds = random.sample(nodes, 1)\n", + " t = run_rw(G, seeds, rw_depth)\n", + " runtime_seq[i] = runtime_seq[i] + t" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "runtime = [None] * max_seeds\n", + "\n", + "for i in range (max_seeds) :\n", + " seeds = random.sample(nodes, i+1)\n", + " t = run_rw(G, seeds, rw_depth)\n", + " runtime[i] = t" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAbAAAAEWCAYAAAAHC8LZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAA3DklEQVR4nO3dd5xU1f3/8ddnZndZOgpYEBFUUASkq4gFC2DBkhiNDaMmMc0Yf4nELho1aiyJvcSer72bxIKoKBa6CCgGUVFAVEA6bJv5/P44d5dhZWGBnb07u+/n47EPdsq99zN3h/uee+6Zc8zdERERyTWJuAsQERHZHAowERHJSQowERHJSQowERHJSQowERHJSQowERHJSQowyQlm1sHMVppZMu5a6iIz62hmbmZ5MW1/oJl9Gv2Njo2jhqiO083snbi2L7VLASabzczmmNma6KD1jZk9aGbNanDdh5bfdvev3L2Zu6dqYv3ZFh1I3cz+XOn+eWY2KJ6qsuovwG3R3+j5yg+a2X5m9p6ZLTOz783sXTPrX/tlSn2iAJMtdZS7NwN6Ab2BC+Mtp075HvizmTWPu5BNsZlncTsBH1WxvhbAf4Bbga2BHYArgOLNrVEEFGBSQ9z9G+BVQpBhZoPMbF7mczLPqszscjN70sweNrMVZvaRmfWLHvsX0AH4d3R29+fKTWRmNsbMroo+1a80s3+bWWsze8TMlpvZRDPrmLHt3c3stejT///M7IT1vQ4z+6mZTap03/8zsxej348ws4+jmueb2Xkb2C0zgfeBP1axrQfN7KqM2+vss2h/jTCzaWa2yszuM7NtzezlaPujzWyrSqs908y+NrMFmbWZWcLMLjCzz8xscbTvt44eK9+3Pzezr4A3qqj3l2Y2O9qHL5pZu+j+z4CdWfv3alRp0S4A7v6Yu6fcfY27j3L3aRnrPtPMZprZEjN71cx2ynisyr9d9Dd/MfqbTwB2yXjMzOzvZvZd9Ph0M+u+vtcmuUkBJjXCzNoDhwOzN2Gxo4HHgVbAi8BtAO4+HPiK6OzO3f9WxfInAsMJn+h3IYTFA4RP+TOBkVFtTYHXgEeBbaLl7jCzPdazzn8Du5lZ54z7To6WBbgP+JW7Nwe6U8XBPsOlwLnlYbEZjgMGE0LgKOBl4CKgLeH/7zmVnn8Q0BkYApyf0Qz7e+BY4ECgHbAEuL3SsgcCXYGhlYsws4OBa4ATgO2BLwl/O9x9F9b9e1U+s5oFpMzsITM7vHLomtkx0Wv6cfS6xgKPRY9t7G93O1AU1XRm9FNuCHBAtO9aRrUvrvzaJHcpwGRLPW9mK4C5wHdEoVFN77j7S9F1rX8BPTdx2w+4+2fuvoxwYP/M3Ue7exnwFKFJE2AYMMfdH3D3Mnf/AHgGOL7yCt19NfACcBJAFGS7EwIWoBTYw8xauPsSd5+yoQLdfSrhAHz+Jr62cre6+7fuPp9wYB/v7h+4exHwXMZrLHeFu69y9+mEMD8puv/XwMXuPi8KmMuBn1RqLrw8WnbNeuo4Bbjf3adEy18IDMg8y62Kuy8H9gMc+CewMDpr2jajtmvcfWb0t/sr0Cs6C6vyb2ehQ89xwGVR3TOAhzI2XQo0J/z9LFr/go3VK7lDASZb6tjobGQQ4UDRZhOW/Sbj99VA4SZef/k24/c167ld3qFkJ2BvM1ta/kM4IG9XxXofZe2B/2Tg+SjYIBwwjwC+NLO3zGxANeq8DPhNxgF7U1T3NZabm/H7l4SzLQj74LmM1z8TSAHbVrFsZe2i9QHg7isJZzM7bPwlQBQep7t7e8KZazvgHxm13ZxR2/eAReve0N+uLZC3ntdcvs03CGf1twPfmdk9Fq7HST2hAJMa4e5vAQ8CN0R3rQKalD8efVpuuymrrLHiwgHuLXdvlfHTzN1/U8XzXwPamlkvQpCVNx/i7hPd/RhCc9bzwJMb27i7fwI8C1xc6aF19hFVB+qm2DHj9w7A19Hvc4HDK+2DwujMrqLUDaz3a0KYABVNe62B+VUuUYVofzxICLLy2n5VqbbG7v4eG/7bLQTK1vOaM7d1i7v3BfYgNCWO2NR6pe5SgElN+gcw2Mx6Eq57FJrZkWaWD1wCVL64vyHfEjoG1IT/AF3MbLiZ5Uc//c2s6/qe7O6lhCbI6wnX014DMLMCMzvFzFpGz1kOpKtZwxXAGYTrfeWmAkeY2dZmth1w7qa/tB+41MyamFm3aHtPRPffBVxd3jnCzNpG156q6zHgDDPrFXXS+CuhOXPOxhaMOmH8KbpOipntSPhgMC6jtgujmjGzlmZW3rxb5d8uanp+Frg8es17AD/L2G5/M9s7ev+tIlwrq+7fS3KAAkxqjLsvBB4mXJNYBvwWuJfwKX0VMG8Di1d2DXBJ1Gy0oZ5+1alrBeGC/omEM4lvgOvYcKA+ChwKPBVdlyk3HJhjZssJ125OqWYNXxCu8zXNuPtfwIfAHGAUa8NmS7xF6EjzOnCDu4+K7r+ZcB1vVHTNchywd3VX6u6jCR1SngEWEDrNnFjNxVdE2xpvZquibc8A/hSt+znC3+PxaL/OIHQIqs7f7mxCM+o3hLO6BzK224JwzW0JoWlxMeFDidQTpgktRUQkF+kMTEREcpICTEREcpICTEREcpICTEREclIsUy9UpU2bNt6xY8e4yxARkTpi8uTJi9x9vd8hrVMB1rFjRyZNmrTxJ4qISINgZl9W9ZiaEEVEJCcpwEREJCcpwEREJCfVqWtg61NaWsq8efMoKiqKuxSphsLCQtq3b09+fn7cpYhIPVfnA2zevHk0b96cjh07YmZxlyMb4O4sXryYefPm0alTp7jLEZF6rs43IRYVFdG6dWuFVw4wM1q3bq2zZRGpFVk9AzOzOYSRqFNAmbv328z11GRZkkX6W4lIbamNM7CD3L3X5oaXiIjkmO8/h5cvgFRpVjdT55sQ64NmzcKs73PmzOHRRysm92XSpEmcc845G1x2zpw5dO/efYPPqQ3lr0FEpEorF8JLI+C2/jDlIVgwLauby3aAOWECvclmdtb6nmBmZ5nZJDObtHDhwiyXE6/KAdavXz9uueWWGCsSEakBJavhrevhll4w8T7oPRzO+QDa983qZrMdYPu5ex/C7Kq/M7MDKj/B3e9x937u3q9t2/UOdxWrOXPmsPvuu3P66afTpUsXTjnlFEaPHs3AgQPp3LkzEyZMAODyyy/nhhtuqFiue/fuzJkzZ511XXDBBYwdO5ZevXrx97//nTFjxjBs2LCK5YcPH86AAQPo3Lkz//znP39QSyqVYsSIEfTv358999yTu++++wfPWbVqFUceeSQ9e/ake/fuPPFEmOR38uTJHHjggfTt25ehQ4eyYMECAD777DMOO+ww+vbty/77788nn3wCwBdffMGAAQPo0aMHl1xyScX6FyxYwAEHHECvXr3o3r07Y8eO3YK9KyI5LZ2GD5+A2/rBm1fBzoPgt+PgqH9A8+2yvvmsduJw9/nRv9+Z2XPAXsDbm7u+K/79ER9/vbymygNgj3YtGHlUtw0+Z/bs2Tz11FPcf//99O/fn0
cffZR33nmHF198kb/+9a88//zz1drWtddeyw033MB//vMfAMaMGbPO49OmTWPcuHGsWrWK3r17c+SRR67z+H333UfLli2ZOHEixcXFDBw4kCFDhqzTZf2VV16hXbt2/Pe//wVg2bJllJaW8vvf/54XXniBtm3b8sQTT3DxxRdz//33c9ZZZ3HXXXfRuXNnxo8fz29/+1veeOMN/vCHP/Cb3/yG0047jdtvv71i/Y8++ihDhw7l4osvJpVKsXr16mq9dhGpZ74aD69eCPMnw/Y94cf/hI4Da7WErAWYmTUFEu6+Ivp9CPCXbG0vmzp16kSPHj0A6NatG4cccghmRo8ePX5wlrUljjnmGBo3bkzjxo056KCDmDBhAr169ap4fNSoUUybNo2nn34aCOH06aefrhNgPXr04E9/+hPnn38+w4YNY//992fGjBnMmDGDwYMHA+FMbvvtt2flypW89957HH/88RXLFxcXA/Duu+/yzDPPADB8+HDOP/98APr378+ZZ55JaWkpxx577Dr1iUgDsPgzGH05zHwRmm8Px94Je54IidrvUpHNM7BtgeeibtV5wKPu/sqWrHBjZ0rZ0qhRo4rfE4lExe1EIkFZWRkAeXl5pNPpiudtznehKndBr3zb3bn11lsZOnRolevo0qULU6ZM4aWXXuKSSy7hkEMO4Uc/+hHdunXj/fffX+e5y5cvp1WrVkydOrVa9QAccMABvP322/z3v//l9NNP549//COnnXZaNV+hiOSs1d/DW3+DifdCsgAGXQT7ng0FTWMrKWuR6e6fu3vP6Kebu1+drW3VBR07dmTKlCkATJkyhS+++OIHz2nevDkrVqyoch0vvPACRUVFLF68mDFjxtC/f/91Hh86dCh33nknpaWha+qsWbNYtWrVOs/5+uuvadKkCaeeeiojRoxgypQp7LbbbixcuLAiwEpLS/noo49o0aIFnTp14qmnngJCQH744YcADBw4kMcffxyARx55pGL9X375Jdtuuy2//OUv+cUvflHxmkWknipdA+/8A27uBRPuhl4nwTlTYND5sYYX5MBQUrniuOOO4+GHH6Zbt27svffedOnS5QfP2XPPPUkmk/Ts2ZPTTz+d3r17/+Dxgw46iEWLFnHppZfSrl27dZoof/GLXzBnzhz69OmDu9O2bdsfXH+bPn06I0aMIJFIkJ+fz5133klBQQFPP/0055xzDsuWLaOsrIxzzz2Xbt268cgjj/Cb3/yGq666itLSUk488UR69uzJzTffzMknn8x1113HMcccU7H+MWPGcP3115Ofn0+zZs14+OGHa3Q/ikgdkU7D9Kfg9b/A8nnQeQgcegVsu0fclVUwd4+7hgr9+vXzyhNazpw5k65du8ZUUe25/PLLadasGeedd17cpWyxhvI3E6m35k6AVy5Y20Fj8JWw84GxlGJmk6saCENnYCIiEiz5MpxxzXgamm0XaweN6lCA1RGXX3553CWISEO1ciGMvSF8CTmRhANGwMBzoVHdHoFHASYi0lCVrIb3boH3bg2dNXqfCgeeDy13iLuyalGAiYg0NO4w/WkYPRKWz4euR8Mhl0GbznFXtkkUYCIiDcm8SfDKhTBvQuigcdy9sNO+cVe1WRRgIiINweLPQgeNj5+HptvA0bdBr5PDNa8cVTe7lsgmO+KII1i6dOkGn9OxY0cWLVpUOwWJSN2w+nt4+Xy4fW/4dBQceEH4InKf4TkdXqAzsHrjpZdeirsEEalLUqVh2Kcx10LxcuhzGgy6sFZGia8tOgPbiPVNT1LV1CSTJ0+mZ8+e9OzZkxEjRlRMRPnggw9y9tlnV6xz2LBhFSPRjxo1igEDBtCnTx+OP/54Vq5cCYSzpZEjR9KnTx969OhRMc3JypUrOeOMM+jRowd77rlnxYC7mWdXxx57LH379qVbt27cc889tbKfRKSOcIdPXoI7BoQvI7frDb9+F466uV6FF+TaGdjLF8A302t2ndv1gMOvrfLh9U1Pcvjhh693apIzzjiD2267jQMOOIARI0ZsdNOLFi3iqquuYvTo0TRt2pTrrruOm266icsuuwyANm3aMGXKFO644w5uuOEG7r33Xq688kpatmzJ9OlhPyxZsuQH673//vvZeuutWbNmDf379+e4446jdevWm7N3RCSXzHk3jBQ/bwK03hVOfjIMAbWegbnrg9wKsBhUnp5kq622Wu/UJEuXLmXp0qUccECYs3P48OG8/PLLG1z3uHHj+Pjjjxk4MMyhU1JSwoABAyoe//GPfwxA3759efbZZwEYPXp0xSC7AFtttdUP1nvLLbfw3HPPATB37lw+/fRTBZhIffbdJzDqEpj9Wpji5KibodcpkMyPu7Ksyq0A28CZUrZUnp7k4IMPXu/UJBvqQFHVVCvuzuDBg3nsscfWu1z5tC3JZLJi2paNGTNmDKNHj+b999+nSZMmDBo0aLOmdhGRHLBmKbx1HYy/O4yaMfhK2OuXkN847spqha6BbUTl6UnGjx+/3qlJWrVqRatWrXjnnXeAdacg6dixI1OnTiWdTjN37lwmTJgAwD777MO7777L7NmzgXC9bdasWRusZ/DgwevMkFy5CXHZsmVstdVWNGnShE8++YRx48Zt+U4QkbolVQaTHoBb+8K4O0MHjd9/AAPPaTDhBbl2BhaD9U1PkpeXt96pSR544AHOPPNMzIwhQ4ZUrGPgwIF06tSJPfbYg65du9KnTx8A2rZty4MPPshJJ51UMRPyVVddtd6pWMpdcskl/O53v6N79+4kk0lGjhxZ0dQIcNhhh3HXXXfRtWtXdtttN/bZZ58s7RkRqXXu8Ml/4fUrYNEs6DAADn82fCG5AdJ0KlkyZ84chg0bxowZM+Iupdbl6t9MpE6bOzFc55o7Dlp3hkMvh92PrLcdNMppOhURkVy1ZE7oWfjRc9BsWxj2D+g9HJI6fGsPZEnHjh0b5NmXiNSQNUtg7I2hg4Ylwyjx+55T56c4qU05EWDujtXz0+T6oi41SYvkpNIimHBPCK+iZWG8woMvgRbt4q6szqnzAVZYWMjixYtp3bq1QqyOc3cWL15MYWFh3KWI5B53mPFMaC5cNhd2HRyuc23XPe7K6qw6H2Dt27dn3rx5LFy4MO5SpBoKCwtp37593GWI5JavPwgjDc0dB9vtCcfcBjsPiruqOq/OB1h+fj6dOnWKuwwRkZq34lt440r44P+gaRs4+tYwgkaOjxJfW+p8gImI1Dslq+H92+Gdv0OqBPY9Gw4YAYUt464spyjARERqSzoN058ME0sunw9dj4JDr4DWu8RdWU5SgImI1IbZr8NrI+Hb6bB9L/jxP6HjwLirymkKMBGRbPpmehhB4/Mx0GonOO4+6PZjSGgo2i2lABMRyYaVC0MHjSkPQ+Ot4LBrod+ZkNco7srqDQWYiEhNKl0TRs94+wYoWwP7/BYOHBFCTGqUAkxEpCakyuDDx2DMNaGDRpfDYMjV0GbXuCurt7IeYGaWBCYB8919WLa3JyJSq1Z8C5/8Jwz/tPAT2KEv/Ohu6LR/3JXVe7VxBvYHY
CbQoha2JSKSfekUTH4Qpj8NX70POLTtCic8DF2PrvdTnNQVWQ0wM2sPHAlcDfwxm9sSEakVqxbBMz8PvQq32QMGXQB7HANtd1dw1bJsn4H9A/gz0LyqJ5jZWcBZAB06dMhyOSIiW2DuRHjqZyHEjr4V+pwWd0UNWta+iGBmw4Dv3H3yhp7n7ve4ez9379e2bdtslSMisvnKSsKwTw8cDok8+PkohVcdkM0zsIHA0WZ2BFAItDCz/3P3U7O4TRGRmjVrFLxyAXz/Gew+LIwUry7xdULWAszdLwQuBDCzQcB5Ci8RyRnffQKvXQqfjoLWneGUZ6DzoXFXJRn0PTARkUzL5ofvck19BAqawZCrYK9fQV5B3JVJJbUSYO4+BhhTG9sSEdksRcvDda5xd4Ru8nv/GvY/D5q2jrsyqYLOwESkYUuVwZSH4M2/wupF0OMEOPhi2Kpj3JXJRijARKThmj0aXr04jKCx00AY8hTs0CfuqqSaFGAi0vAsmg2vXgSfvgpb7ww/fQR2P1JfRM4xCjARaTjWLAmjxI+/G/IKYfCV4VqXOmjkJAWYiNR/ZSUw8V54+2+wZin0PgUOGQnNtom7MtkCCjARqb/c4eMXYPRIWDIHdh4UusVv1yPuyqQGKMBEpH6aNzlc55o7Lgy6e8ozsOshus5VjyjARKR+WTIH3rgapj8JTbeBo26G3sMhkYy7MqlhCjARqR9WLoSxN8DE+0JY7X8e7HcuNKpyMgzJcQowEcltJavhvVvhvVugdA30PjXM0dWiXdyVSZYpwEQkN7mHGZFHj4Tl88NMyIdcBm06x12Z1BIFmIjknrkTQweNeRNg+55w3L2w075xVyW1TAEmIrlj0Wx4/QqY+SI02xaOvg16nQKJrM3NK3WYAkxE6r5Vi8MUJ5Puh/zGMOgiGPA7aNQs7sokRgowEam7UqWhV+GYv0LxSuh7euigoRE0BAWYiNRF7mEm5FGXwqL/wc4HwWHXwDZd465M6hAFmIjULV+Nh9GXw1fvhZHiT3wMdjtcI2jIDyjARKRuWPxZOOP633/DCBpH3gh9fgbJ/LgrkzpKASYi8SpaBm9fD+PugrxGcNAlsM9v1EFDNkoBJiLxSJXB1P+DN66CVYvCFCcHXwbNt427MskRCjARqV3uMOvVMILGwk9gx73h5Cdhhz5xVyY5RgEmIrVn/hR47TKYMxa23gVO+Bd0PUodNGSzKMBEJPuWfgWv/wWmPwVNWsMRN4TvdKmDhmwBBZiIZM+apfDOTaGDhhns/ycYeC4Utoi7MqkHFGAiUvNSpTDpgTD805ol0PMkOPgSaLlD3JVJPaIAE5Ga4w7/eylc51o8GzodAEOuCiPGi9QwBZiI1IyvP4BXL4Ev34HWneGkx6HLYeqgIVmjABORLbP0q/BdrmlPQJM2GkFDao0CTEQ2z+rvYeyNMOEesATs9//CT2HLuCuTBkIBJiKbpqw4hNbb10PRcuh1Mhx0EbRsH3dl0sBkLcDMrBB4G2gUbedpdx+Zre2JSJa5w0fPhZHil34Jux4Kh14B23WPuzJpoLJ5BlYMHOzuK80sH3jHzF5293FZ3KaIZMPcCfDqxTBvAmzTDYY/B7scHHdV0sBlLcDc3YGV0c386MeztT0RyYIlX4Yzro+ehWbbwtG3Qq9TIJGMuzKR7F4DM7MkMBnYFbjd3cdnc3siUkOKlsHYm2DcnaGDxoHnw77naIoTqVOyGmDungJ6mVkr4Dkz6+7uMzKfY2ZnAWcBdOjQIZvliMjGpEph8oNhBI3Vi2HPn8Ihl6mDhtRJmxRgZtbE3Vdv6kbcfamZvQkcBsyo9Ng9wD0A/fr1UxOjSBzcYdYrYQSNRbNgp/1g6FXQrnfclYlUKVGdJ5nZvmb2MfBJdLunmd2xkWXaRmdemFljYHD58iJSh8yfAg8Og8dOBE/DiY/B6f9ReEmdV90zsL8DQ4EXAdz9QzM7YCPLbA88FF0HSwBPuvt/NrtSEalZS+fC61dEU5y00RQnknOq3YTo7nNt3THNUht5/jRAH+FE6priFfDO3+H928Pt/c+DgX/QFCeSc6obYHPNbF/Ao+90/QGYmb2yRKTGpUrhg3/Bm9fAqu+gxwlw6Eh10JCcVd0A+zVwM7ADMB8YBfwuW0WJSA1Kp8P3uN68Gr7/HHbcJ4wU375v3JWJbJFqBZi7LwJOyXItIlLT5rwDr1wA30wPI2ic9AR0GaopTqReqFaAmVkn4PdAx8xl3P3o7JQlIltk2Xx47VKY8Qy03BF+/E/o/hNIVKvjsUhOqG4T4vPAfcC/gXTWqhGRLVOyGsbdEUbRSJeFETQGngsFTeKuTKTGVTfAitz9lqxWIiKbL52CqY+G61wrFsDuw2DIVbB1p7grE8ma6gbYzWY2ktB5o7j8TnefkpWqRKR63OF/L8Prf4GFM2GHfvCT+2GnfeOuTCTrqhtgPYDhwMGsbUL06LaIxOGLt0NwzZsIW+8CJzwMXY9WBw1pMKobYMcDO7t7STaLEZFqWDgr9Cz87HVosQMcdUuYFVkjaEgDU90AmwG0Ar7LXikiskHFK+Ctv4VOGvlNYcjV0P8XkF8Yd2UisahugLUCPjGziax7DUzd6EWyLZ2CaU+GcQtXLIDep8Ihl0OztnFXJhKr6gbYyKxWISI/5A6fvhZmRP7uI2jXB376f9C+X9yVidQJ1R2J461sFyIiGRZMg1cvgjljYatO8JMHYI9j9UVkkQwbDDAze8fd9zOzFYRehxUPAe7uGr5apCatWgRvXAVTHoLCVnD49WGKk7yCuCsTqXM2GGDuvl/0b/PaKUekgSorgYn3wlvXQvFK2OtXMOh8aLxV3JWJ1FnVHQvxX+4+fGP3icgmcoePngsdNJbMgV0OhqHXwDa7x12ZSJ1X3U4c3TJvmFkeoLkYRLbEl+/DqEtg/qQwUvypz8Cuh8ZdlUjO2Ng1sAuBi4DGZra8/G6gBLgny7WJ1E+LP4PRI2Hmv6H59nDM7dDzJEgk465MJKds7BrYNcA1ZnaNu19YSzWJ1E+rFsPbfwvXupKN4KBLYMBvoaBp3JWJ5KTqdqO/0Mx2AHZi3fnA3s5WYSL1RmkRTLgb3r4RSlZAn9Ng0EXQfNu4KxPJadXtxHEtcCLwMZCK7nZAASZSleKVMO1xeOdmWPYVdB4Kg6+AbbrGXZlIvVDdThw/AnZz9+KNPlOkoVv8GUz4J0x9BIqXhxE0jrkVdh4Ud2Ui9Up1A+xzIJ+McRBFpJJUKYy9Ed6+HjDodmz4Plf7fpriRCQLqhtgq4GpZvY66w7me05WqhLJNd9+BM/9Gr6ZBj1OgCFXQvPt4q5KpF6rboC9GP2ISKaSVfDuzTD2JihsGQbb7XpU3FWJNAjV7YX4ULYLEckp6TRMeyLMiLzia+j+Ezj8OmjaJu7KRBqM6vZC/IJ1B/MFwN13rvGKROq6Oe/CqIvh6w9CB43jH4AO+8RdlUiDU90mxMwJiAqB44Gta74ckTps0afw2kj433+hxQ7wo3ugx/Ga4kQkJtVtQlxc6a5/mNlk4LKaL0mkjlm1CMZc
C5Puh/wmcMhlsM9vIb9x3JWJNGjVbULsk3EzQTgjq+7Zm0huKiuG8XfB2zeEzhp9T4dBF0KztnFXJiJUP4RuzPi9DJhDaEYUqX/Safjo2dBBY+mXYQSNIVdC293irkxEMlS3CfGgzNtmliQMLTWrqmXMbEfgYWBbQgeQe9z95s0vVaQWfP4WvHYZLJgK23aH4c+FObpEpM7Z2HQqLYDfATsALwCjo9t/AqYBj2xg8TLgT+4+xcyaA5PN7DV3/7hGKhepSd99Eubmmv0atNwRfnR3+EKyOmiI1FkbOwP7F7AEeB/4JXAxYT6wH7n71A0t6O4LgAXR7yvMbCYhCBVgUnes/j500Jh4LxQ0g8FXwl5nQX5h3JWJyEZsLMB2dvceAGZ2LyGQOrh70aZsxMw6Ar2B8et57CzgLIAOHTpsympFNl9ZCUy6L4RX8XLod2aY4qRp67grE5Fq2liAlZb/4u4pM5u3GeHVDHgGONfdl1d+3N3vIZrduV+/fj/4srRIjXKHj1+A0ZfDki9g54Ng6F9h2z3irkxENtHGAqynmZWHjgGNo9sGuLu32NDCZpZPCK9H3P3ZLa5WZEt8NR5euxTmjodt9oBTn4FdD427KhHZTBsMMHdPbu6KzcyA+4CZ7n7T5q5HZIst/iyccc18EZptB0ffCr1OgcRmv71FpA7I5peRBwLDgelmNjW67yJ3fymL2xRZa/X38NZ1oYNGslG4xrXv2VDQNO7KRKQGZC3A3P0dQlOjSO1KlYZhn978a+ig0ee0EF7Nt427MhGpQRoOSuoPd5j1Shhwd9H/oNOBcNg1sG23uCsTkSxQgEn9MG8SjLoUvnoPtt4FTnocuhwGpkYAkfpKASa57fsvQgeNj5+Hpm3hyBuhz88gmR93ZSKSZQowyU1Fy8Io8ePvgkQeHHg+7Pt7aNQ87spEpJYowCS3pMpgyoPw5jWwehH0PBkOuRRatIu7MhGpZQowyQ3uMOvV8EXkRbNgp4Ew9Glo1zvuykQkJgowqfsWfBg6aHzxFrTeFU58FHY7Qh00RBo4BZjUXUvnwhtXwbQnoPFWcPjfwqC76qAhIijApC4qWgbv/B3evyPcHvgH2P+PUNgy3rpEpE5RgEndkSqDyQ+EKU5WL4I9fwoHXwqtdoy7MhGpgxRgEr8fdNDYD4ZepQ4aIrJBCjCJ14JpMOpi+OLtMIKGOmiISDUpwCQeS+eGwXY/fEwdNERksyjApHat/h7euQnG3wN4mN5k//Ogcau4KxORHKMAk9pRVgIT7oa3rg9TnPQ6GQZdqA4aIrLZFGCSXeVTnLx6MXz/Gew6GAZfoSlORGSLKcAke779CEZdAp+9AW26wCnPQOdD465KROoJBZjUvOUL4M2rYeoj0KgFHHYt9P+FOmiISI1SgEnNKVkF794C790CqVLY57dwwHmhl6GISA1TgMmWS6fDeIWv/wVWfA17HAuHXg5bd4q7MhGpxxRgsnlWfAvzJ8H8yfDpKPhmehg54yf3w04D4q5ORBoABZhsmkWz4cWz4av3w+1EHmzbHX50N/Q4ARKJeOsTkQZDASbVk07DxHvhtcsgr1FoIuywL2y/J+Q3jrs6EWmAFGCyccu/hud+HSaU3HUwHH0rtNg+7qpEpIFTgMmGff4WPPNzKFkNR90MfX6mgXZFpE5QgMn6pdNhzMI3r4bWneH0/0Lb3eKuSkSkggJMfmjhLHjlAvjsdej+k3Dm1ahZ3FWJiKxDASZrrfgWxlwDUx6G/CZwxA1hBA01GYpIHaQAkzCCxnu3hlE0UsUhtA78MzRtE3dlIiJVUoA1ZOlUmFDyjatgxQLY4xg4ZCS03iXuykRENiprAWZm9wPDgO/cvXu2tiOb6fMxYaT4b6bDDv3g+Aehwz5xVyUiUm3ZPAN7ELgNeDiL25BNtfB/MOpS+PRVaNkBjrsPuh+n61wiknOyFmDu/raZdczW+mUTrVwYOmhMfhAKmsKhV8Dev4b8wrgrExHZLLFfAzOzs4CzADp06BBzNfVQaRGMvxPG3hQ6a/Q7EwZdoA4aIpLzYg8wd78HuAegX79+HnM59Yc7zHgGRl8By76CLofD4L9A2y5xVyYiUiNiDzDJgq/GwasXhalOtusBx7wIOx8Yd1UiIjVKAVafLP4MXr8CPn4Bmm8Px9wBPU+ERDLuykREalw2u9E/BgwC2pjZPGCku9+Xre01aKsWw9t/g4n3QbIABl0I+/4+dNYQEamnstkL8aRsrVsipUUw/i4YeyOUrIQ+p4Xwar5d3JWJiGSdmhBzUToNM56G1/8Cy+ZC56Ew+ArYpmvclYmI1BoFWK758r3QQePrD2C7PeGY29VBQ0QaJAVYrvj+C3jtMpj5IjRvB8feBXv+FBKJuCsTEYmFAqyuW7MkXOMafzck8uCgi2HA2VDQJO7KRERipQCrq8qKYeK98NbfoGgZ9DoFDr4EWmwfd2UiInWCAqyucYePngvf51oyB3Y5OIygsV2PuCsTEalTFGB1yVfjwhQn8ybCNt3g1Gdg10PjrkpEpE5SgNUF63TQ2B6Ovg16nawRNERENkABFqfiFaGDxvu3hw4agy6Cfc/WCBoiItWgAItDqgym/h+8+VdY+S3seSIcOhJatIu7MhGRnKEAq03uMPPfYQSNxZ9C+73gxEehfb+4KxMRyTkKsNry1XgYdXHooNFmtxBcux0BZnFXJiKSkxRg2bbkSxg9MnSNL++g0fMkSGrXi4hsCR1Fs6VoWeigMe4usAQceAEMPEcdNEREaogCrKalSmHS/TDm2jAMVM8T4ZDL1EFDRKSGKcBqijvMeiV8EXnxbOh0AAy5CrbvGXdlIiL1kgKsJnz7EbxyIXzxFrTpAic/CZ2HqIOGiEgWKcC2xIpvYMw1MOVhKGwJh18P/c6AZH7clYmI1HsKsM1RvALevQXevy1c89rrLDjwfGiyddyViYg0GAqwTZEqhSkPhQ4aqxZCtx/DIZfC1jvHXZmISIOjAKuO8g4ar10Gi2bBTvvByU/ADn3jrkxEpMFSgG3M11NDz8I5Y6H1rnDiY7Db4eqgISISMwVYVZbNhzeuhA8fD9e21EFDRKROUYBVVrwC3r0Z3rsNPB1Gz9j/T6GXoYiI1BkKsHLpFHzwL3jjalj1HXQ/Dg4ZCVvtFHdlIiKxSqedFUVlLFldwqqSMtaUpFhTmqI0lcbMSJiRNKMklWJVcYrVJWWsKk5xXJ/2tGySvVYrBRjA7Ndh1KXw3Uew495w0mOa4kREtlhJWZpVxWWsKU3hgBEun7tD2p10GsrSaZatKWXp6lKWrC5hTWmKwrwkjQuSNM5PUlSaYumaUpatKWVVcRnJhJGfTJCfNAwj5U7anVTKWV2aYlVxCI+VxaUsX1PG8qJSVhSVkZ80mhXm07xRHoX5CUpSTnFpipJUmlTa19bkUJZKU5Z2ytJpVheH7afSvsmvf99dWyvAsua7maGDxuzR0GonOP5B2ONYddAQqQVlqTQrispYVVJGcVmakrI0xWXpioNn+UG1MD9BYX6SwvwEAKUppywVDq5laac0la64XVL
[remainder of base64-encoded PNG omitted: 'Runtime vs. Number of Seeds' line plot comparing the multiple-seed and sequential runs]\n", +        "text/plain": [ +         "<Figure size 504x288 with 1 Axes>" +        ] +       }, +       "metadata": { +        "needs_background": "light" +       }, +       "output_type": "display_data" +      } +     ], +     "source": [ +      "%matplotlib inline\n", +      "seed_idx = list(range(1, max_seeds + 1))\n", +      "\n", +      "plt.figure(figsize=(7,4))\n", +      "plt.plot(seed_idx, runtime, label=\"multiple seeds\")\n", +      "plt.plot(seed_idx, runtime_seq, label=\"sequential\")\n", +      "\n", +      "plt.title('Runtime vs. Number of Seeds')\n", +      "plt.xlabel('Number of Seeds')\n", +      "plt.ylabel('Runtime (s)')\n", +      "plt.legend()\n", +      "plt.show()" +     ] +    }, +    { +     "cell_type": "markdown", +     "metadata": {}, +     "source": [ +      "-----\n", +      "Copyright (c) 2021, NVIDIA CORPORATION.\n", +      "\n", +      "Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n", +      "\n", +      "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." +     ] +    } +   ], +   "metadata": { +    "kernelspec": { +     "display_name": "cugraph_dev", +     "language": "python", +     "name": "cugraph_dev" +    }, +    "language_info": { +     "codemirror_mode": { +      "name": "ipython", +      "version": 3 +     }, +     "file_extension": ".py", +     "mimetype": "text/x-python", +     "name": "python", +     "nbconvert_exporter": "python", +     "pygments_lexer": "ipython3", +     "version": "3.8.8" +    } +   }, +   "nbformat": 4, +   "nbformat_minor": 4 +} diff --git a/notebooks/cugraph_benchmarks/random_walk_perf.ipynb b/notebooks/cugraph_benchmarks/random_walk_perf.ipynb new file mode 100644 index 00000000000..738298767c5 --- /dev/null +++ b/notebooks/cugraph_benchmarks/random_walk_perf.ipynb @@ -0,0 +1,621 @@ +{ + "cells": [ +  { +   "cell_type": "markdown", +   "metadata": {}, +   "source": [ +    "# Random Walk Performance\n", +    "# Skip notebook test\n", +    "\n", +    "Random walk performance is governed by the length of the paths to find, the number of seeds, and the size and structure of the graph.\n", +    "This benchmark uses four test graphs of increasing size. Although their sizes do not grow by even multiples, the four graphs should give a good indication of how Random Walk performance scales as the data size grows.
\n", +    "\n", +    "### Test Data\n", +    "Users must run the _dataPrep.sh_ script before running this notebook so that the test files are downloaded.\n", +    "\n", +    "| File Name              | Num of Vertices | Num of Edges |\n", +    "| ---------------------- | --------------: | -----------: |\n", +    "| preferentialAttachment |         100,000 |      999,970 |\n", +    "| dblp-2010              |         326,186 |    1,615,400 |\n", +    "| coPapersCiteseer       |         434,102 |   32,073,440 |\n", +    "| as-Skitter             |       1,696,415 |   22,190,596 |" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 1, +   "metadata": {}, +   "outputs": [], +   "source": [ +    "# Import the modules\n", +    "import cugraph\n", +    "import cudf" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 2, +   "metadata": {}, +   "outputs": [], +   "source": [ +    "# system and other\n", +    "import gc\n", +    "import os\n", +    "import time\n", +    "import random\n", +    "\n", +    "# MTX file reader\n", +    "from scipy.io import mmread\n", +    "\n", +    "import networkx as nx" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 3, +   "metadata": {}, +   "outputs": [], +   "source": [ +    "try:\n", +    "    import matplotlib\n", +    "except ModuleNotFoundError:\n", +    "    os.system('pip install matplotlib')\n", +    "\n", +    "import matplotlib.pyplot as plt; plt.rcdefaults()" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 4, +   "metadata": {}, +   "outputs": [], +   "source": [ +    "try:\n", +    "    import pybind11\n", +    "except ModuleNotFoundError:\n", +    "    os.system('pip install pybind11')\n", +    "\n", +    "import pybind11" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 5, +   "metadata": {}, +   "outputs": [], +   "source": [ +    "try:\n", +    "    import walker\n", +    "except ModuleNotFoundError:\n", +    "    os.system('pip install graph-walker')\n", +    "\n", +    "import walker" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 6, +   "metadata": {}, +   "outputs": [], +   "source": [ +    "# Test files\n", +    "data = {\n", +    "    'preferentialAttachment' : './data/preferentialAttachment.mtx',\n", +    "    'dblp'                   : './data/dblp-2010.mtx',\n", +    "    'coPapersCiteseer'       : './data/coPapersCiteseer.mtx',\n", +    "    'as-Skitter'             : './data/as-Skitter.mtx'\n", +    "}" +   ] +  }, +  { +   "cell_type": "markdown", +   "metadata": {}, +   "source": [ +    "## Read the data and create a graph" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 7, +   "metadata": {}, +   "outputs": [], +   "source": [ +    "# Data reader - the file format is MTX, so we use the reader from SciPy\n", +    "def read_data(datafile):\n", +    "    print('Reading ' + str(datafile) + '...')\n", +    "    M = mmread(datafile).asfptype()\n", +    "\n", +    "    _gdf = cudf.DataFrame()\n", +    "    _gdf['src'] = M.row\n", +    "    _gdf['dst'] = M.col\n", +    "    _gdf['wt'] = 1.0\n", +    "\n", +    "    return _gdf" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 8, +   "metadata": {}, +   "outputs": [], +   "source": [ +    "def create_cu_ugraph(_df):\n", +    "    _g = cugraph.Graph()\n", +    "    _g.from_cudf_edgelist(_df, source='src', destination='dst', edge_attr='wt', renumber=False)\n", +    "    return _g" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 9, +   "metadata": {}, +   "outputs": [], +   "source": [ +    "def create_nx_ugraph(_df):\n", +    "    _gnx = nx.from_pandas_edgelist(_df, source='src', target='dst', edge_attr='wt', create_using=nx.Graph)\n", +    "    return _gnx" +   ] +  }, +  { +   "cell_type": "markdown", +   "metadata": {}, +   "source": [ +    "### Define the call to RandomWalk\n", +    "We are only interested in the runtime, so the results are thrown away." +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 10, +   "metadata": {}, +   "outputs": [], +   "source": [
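+    "# Timing wrapper: run one cuGraph random-walk call and return only the\n", +    "# elapsed wall-clock seconds; the walk results themselves are discarded\n",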
run_cu_rw(_G, _seeds, _depth):\n", + " t1 = time.time()\n", + " _, _ = cugraph.random_walks(_G, _seeds, _depth)\n", + " t2 = time.time() - t1\n", + " return t2" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def run_wk_rw(_G, _seeds, _depth):\n", + " t1 = time.time()\n", + " _ = walker.random_walks(_G, n_walks=1, walk_len=_depth, start_nodes=_seeds)\n", + " t2 = time.time() - t1\n", + " return t2 \n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 1: Runtime versus path depth" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading ./data/preferentialAttachment.mtx...\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=2.23s\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=2.48s\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=2.02s\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=2.31s\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=2.01s\n", + "update i\n", + "Reading ./data/dblp-2010.mtx...\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=4.21s\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=4.03s\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=3.59s\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=3.95s\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=3.68s\n", + "update i\n", + "Reading ./data/coPapersCiteseer.mtx...\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=59.64s\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=49.43s\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=47.45s\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=54.66s\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=46.96s\n", + "update i\n", + "Reading ./data/as-Skitter.mtx...\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=53.14s\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=44.36s\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=46.38s\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=41.96s\n", + "\tcuGraph\n", + "\tWalkerRandom walks - T=53.18s\n", + "update i\n" + ] + } + ], + "source": [ + "# some parameters\n", + "max_depth = 6\n", + "num_seeds = 100\n", + "\n", + "# arrays to capture performance gains\n", + "names = []\n", + "\n", + "# Two dimension data\n", + "time_algo_cu = [] # will be two dimensional\n", + "time_algo_wk = [] # will be two dimensional\n", + "\n", + "i = 0\n", + "for k,v in data.items():\n", + " time_algo_cu.append([])\n", + " time_algo_wk.append([])\n", + " \n", + " # Saved the file Name\n", + " names.append(k)\n", + "\n", + " # read data\n", + " gdf = read_data(v)\n", + " pdf = gdf.to_pandas()\n", + " \n", + " # Create the Graphs\n", + " Gcg = create_cu_ugraph(gdf)\n", + " Gnx = create_nx_ugraph(pdf)\n", + " \n", + " num_nodes = Gcg.number_of_nodes()\n", + " nodes = Gcg.nodes().to_array().tolist()\n", + "\n", + " seeds = random.sample(nodes, num_seeds)\n", + "\n", + " for j in range (2, max_depth+1) :\n", + " print(\"\\tcuGraph\")\n", + " tc = run_cu_rw(Gcg, seeds, j)\n", + " time_algo_cu[i].append(tc)\n", + " \n", + " print(\"\\tWalker\", end='')\n", + " tw = run_wk_rw(Gnx, seeds, j)\n", + " time_algo_wk[i].append(tw)\n", + "\n", + " # update i\n", + " i = i + 1\n", + " print(\"update i\")\n", + " \n", + " del Gcg\n", + " del Gnx\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAl4AAAFNCAYAAADRi2EuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAACN0ElEQVR4nOzdd3hUxf748ffsphcgQIBQEyAJ6QlJ6CACAopgQYoVrGDv2Lv+roUrKnq/qAjoVRABEfUiIgoSejO0BAgl9B5Ib7s7vz/O5pCQEIKShPJ5Pc8+u2fPzJw5Z9tn58yZUVprhBBCCCFE9bPUdgWEEEIIIS4XEngJIYQQQtQQCbyEEEIIIWqIBF5CCCGEEDVEAi8hhBBCiBoigZcQQgghRA2RwEuIi4hSqqVSKkcpZa3tutQ2pdQipdQ9tV2Pc6GUelUp9fU55pmmlLq+mqpU65RSI5VSS6qQ7t9Kqftrok5CVCcJvIT4h5RS6UqpfGdAdEgpNUUp5XMey+5Tsqy13qO19tFa289H+TXNeWyKnMcqQyn1m1KqXRXynXPAclr+nkqpfX83f21tUykVDcQAc5zLAUqpH5VSB5RSWikVeFp6d6XUJKVUlvO9+MRp63srpbYopfKUUguVUq0q2XY3pdQypVSm87VaqpRK/Cf78w+NBZ5XSrnVYh2E+Mck8BLi/BiotfYBYoE44Lnarc4F7V3nsWoOHAGm1G51LmijgG/0qZGuHcA8YPAZ0r8KBAOtgCuBMUqp/gBKqYbA98BLQH1gDTC9okKUUnWAn4HxzrTNgNeAwn+8R3+T1vogsAUYVFt1EOJ8kMBLiPNIa30I+BUjAKuw1aN0K5azJec7pdRXSqlspdRmpVSCc91/gZbAT84WojFKqUBnS4eLM80ipdSbzpaJHKXUT0qpBkqpb5ytHqtLt4oopdo5W5kylFJblVJDK9oPpdQwpdSa0557XCn1o/PxNUqpFGed9yulnvobxyoPmApEOsv8UCm111nvtUqp7s7n+wPPA8Oc+7i+VDGtnC0x2Uqp+c7g4pwopZoqpWYppY4qpXYppR4pte6Mr49zfXul1F/OdTOUUtOdr4c38AvQ1FnnHKVUU2c2tzOVV4GrgT9LHbPDWuv/AKvPkH4E8IbW+oTWOhX4HBjpXHcjsFlrPUNrXYARpMWcocUxxLm9aVpru9Y6X2s9X2u9odS+36WUSlVKnVBK/Vq69ayy95nz/fmj83VeBbQptU4ppcYppY44129USkWWqtciYEAlx0uIC54EXkKcR0qp5hg/ltvPIdsg4FugHvAj8DGA1vp2YA/O1jSt9btnyD8cuB2jVaINsByYjNFSkQq84qybN/AbRrDTyJnvP0qp8ArK/AkIVUoFl3ruFmdegC+AUVprX4zA6Y9z2F+c9fEBbgX+cj61GiNgre/czgyllIfWeh7w/4DpzuMQc1qd7nTujxtwTgGgUsqCsa/rMY5fb+AxpVS/UskqfH2cp7xmY7TY1QemATcAaK1zMd4HB5x19tFaH6isvArq5g0EAVuruC9+QIBzX0qsByKcjyNKr3PWcUep9aVtA+xKqS+VUlc7yy69reswguEbAX8gybn/VXmffQIUOOt6l/NWoi/QAyPwqwsMBY6XWp+KcepViIuWBF5CnB8/KKWygb0Yp89eOYe8S7TWc539tv7Luf+wTNZa79BaZ2K0suzQWi/QWtuAGRinPgGuBdK11pO11jat9V/ALGDI6QU6W6PmADcDOAOwdhiBAkAxEK6UquNsXVl3DvV9Sil1EiM49cHZIqO1/lprfdxZt38D7kBoFfZ9m9Y6H/gOZ0vjOUgE/LXWr2uti7TWOzFaiYaXSnOm16cT4AJ8pLUu1lp/D6yqwjar+nrXc95nV3FfSvoVZpZ6LhPwLbU+k7JKrzdprbOAboDGOB5Hna1UjZ1JRgP/0lqnOt9n/w+IdbZ6nfF9poyLQgYDL2utc7XWm4AvS2262FmfdoByln+w1PpsTh0XIS5KEngJcX5c72z96Ynxo3Eup7wOlXqcB3iUnEqsosOlHudXsFzyg9wK6KiUOllyw2hxanKGcqfiDLwwWpZ+cAZkYPx4XgPsVkr9qZTqfA71Hau1rqe1bqK1HqS13gGglHrKeeoq01m3upz9OJ5+7M71ooZWGKcDSx+T54HGpdKc6fVpCuwv1f8KjMD7bKr6ep903pcLjM4gx3lfp9RzdTgVuOWctu709WU4g56RWuvmGK2aTYEPnKtbAR+WOmYZgMJoNazsfeaPEayWPk67S23zD4wWwE+AI0qpz5TR36yEL6eOixAXJQm8hDiPtNZ/Ypx6Gut8KhfwKlnv/Mfvfy5FnrfKGT92fzqDnpKbj9b6TJfo/wb4K6ViMQKwktOMaK1Xa62vwziV9ANGa9Pf5uzPNQbj1JKf1roeRmuMKtnkPym/EnuBXacdE1+t9TVVyHsQaKaUUqWea1Hq8T+qc6lTgSFVTH/CWafSLWgxwGbn482l1zlPCbYptb6ysrdgvK9L+lvtxTjVXPq4eWqtl1H5++woYKPscWp52rY+0lrHA+HOfX+61Oowyp5KFeKiI4GXEOffB8BVSqkYjL4yHkqpAUopV+BFjFNoVXUYaH2e6vUzEKKUul0p5eq8JSqlwipKrLUuxjhV+R5GH6bfwOjbpJS6VSlV15kmC+Nqu3/CF+MH+SjgopR6mbKtM4eBQGefrL9NKeVR+oZxajBbKfWMUspTKWVVSkWqqg2bsBywAw8ppVyc/Z46nFbnBkqpuv+gynOBK07fB069h9ydyyW+Al5USvk5O83fy6mrRmcDkUqpwc48LwMbnEFVGc7O8U86+yyilGqBEXyvcCaZADynlIpwrq+rlCo5ZX3G95nz9Or3wKtKKS9nv68RpbabqJTq6Pys5GL0BSv93roC43S6EBctCbyEOM+01kcxfgBfdva7egCYCOzH+DE5l7Gd/oXxQ3pS/Y0rB0+rVzZG5+XhwAGMU17vUHkgOBXoA8xw9uUpcTuQrpTKwujvcyuUGeC1ZfmiKvUrxjAJ2zBOPRVQ9nTUDOf9caXUufQnK60ZxqnX0rcgjD5JscAu4BjGa3XWYElrXYTRufxujNNft2EEHYXO9VswOpzvdL5+Tc9QVGU+A249rVUtn1OnFbc4l0u8gtFKthvjasj3nBcnlLwvBwNvASeAjpTty1ZatnP9SqVULkbAtQl40lnWbIz3zrfO98AmjIsJqvI+ewjjlPAhjKBwcqnt1sHoU3bCuQ/HMQJ/lFIBGK1gP5zpYAlxMVBluycIIYT4u5RSK4EJWuvJZ01c9TKnAt9prX84X2VejJRS/8a4cOQ/tV0XIf4JCbyEEOJvUkpdgTHcwzGMVr8JQOvTrsQTQgjTuVw5JYQQoqxQjAsLvIGdwE0SdAkhKiMtXkIIIYQQNUQ61wshhBBC1BAJvIQQQgghashF0cerYcOGOjAwsLarIYQQQghxVmvXrj2mta5wsOyLIvAKDAxkzZo1tV0NIYQQQoizUkrtPtM6OdUohBBCCFFDJPASQgghhKghEngJIY
QQQtQQCbyEEEIIIWqIBF5CCCGEEDWkWgMvpVQ9pdRMpdQWpVSqUqqzUqq+Uuo3pVSa896vOusghBBCCHGhqO4Wrw+BeVrrdkAMkAo8C/yutQ4GfncuCyGEEEJc8qot8FJK1QV6AF8AaK2LtNYngeuAL53JvgSur646CCGEEEJcSKqzxSsIOApMVkr9pZSaqJTyBhprrQ860xwCGldjHYQQQgghLhjVGXi5AO2B/9NaxwG5nHZaUWutAV1RZqXUfUqpNUqpNUePHq3GahqO7UmnuKCg2rcjhBBCiMtXdU4ZtA/Yp7Ve6VyeiRF4HVZKBWitDyqlAoAjFWXWWn8GfAaQkJBQYXB2vmit+WHsm+SePEHruERCOnWjdVwCrh4e1blZIYQQQlxmqi3w0lofUkrtVUqFaq23Ar2BFOdtBPC2835OddXhXPQb9QhbVywlbeVStq1Ygou7O63bdyC0U1eC4hJwdZcgTAghhBD/jDLO9lVT4UrFAhMBN2AncCfG6c3vgJbAbmCo1jqjsnISEhJ0TU2S7XDY2ZeymW0rlpC2ahl5mSfNIKzDoME0bt22RuohhBBCiIuTUmqt1jqhwnXVGXidLzUZeJV2KghLYtvKZVz76BhaRsaQcWAfx/bupnX7Dri4utZ4vYQQQghx4aos8KrOPl4XPYvFSsvIaFpGRtPrztEopQBIWfwHq3/8nvs//xoXV1dOHj6Et58frm7utVxjIYQQQlzIJPCqIovVaj7uMuRWQjp1w8PbB4BfPv43R3fvonV8B0I7dyMwNl6CMCGEEEKUI4HX32CxWmkU2Npc7jrsNrYuTyJt5TK2LluMq4cnbeI7ENKpqwRhQgghhDBJH6/zyGG3s3fzRratWMK2VcsoyM4yg7C4/gNpGtKutqsohBBCiGomfbxqiMVqpVV0LK2iY+l99/3s3byRrSuSSFu1nNbtE2ka0o6cjOMc2pFGYGy8dMwXQgghLjPS4lUD7DYboLG6uLL2fz+w6KuJ3P3RROo1bkJOxnE8fHxxcXOr7WoKIYQQ4jyQFq9aZnU5dZhj+11LkzYh1GvcBIA/Jn/K7o1/0Sa+IyGduxMYHSdBmBBCCHGJksCrhlldXGjWLtxcjul7De7e3mxftZzUJYtw8/SkTUInQjp1IzCmvZyOFEIIIS4hcqrxAmG32dizaT3bVixh+6rlFOTm4ObpRZuEjkT36U/zdhG1XUUhhBBCVIGcarwIWF1cCIqNJyg2nj73PMCejevZumIJ21cvp1Fga5q3i6AwL499qZtoFR0nLWFCCCHERUhavC5wdlsxDpsdVw8PUpMWMvfjfzP89fdoFhpGfk42bh4eWF0kCBNCCCEuFNLidRGzuriagVVI5254+tahaXAoAEunf82WpYtom9CJkM7daBUVK0GYEEIIcQGTwOsiYnVxJTA23lwO6dgFW2EB21evYPOfv+Pu7U3bhM6Edu5Gy6gYCcKEEEKIC4wEXhexlpExtIyMwVZczJ6NyUbH/NXL2fznAiMIS+xM5BV9aB4eWdtVFUIIIQQSeF0SXFxdad0+kdbtE7EVF7N7w19sW7GEtJXL8PStQ/PwSBx2O3s2JtMiMqbMuGJCCCGEqDnyC3yJcXF1pU18B9rEd8BWXIytqBCAvZs3MutfrzDoyecJ7tCFooJ8Z/8xeQsIIYQQNUV+dS9hLq6u5rATzcIiuH7MS7SMigVg7f9+YN3cH2mb2JnQTl2lJUwIcUkrLizg0PZt7NuymfysLHrdOQowrhyX/rCiJskv7WXCaAnraC43D4vkxMEDbFuRxKaF8/Hw8TWCsM7daBERLUGYEOKilp+dxf4tKezfmsL+1M0c3rUdh90OShHX/1oAtNZMfOhuInv1pevQ29BaU1xYgJuHZy3XXlzK5Nf1MtUiPIoW4VHYiopI3/AX25YnlQnCgjt0JqxbT1pERNd2VYUQospW/ziLzX/+zvF9ewBjcOrGbUJIuPYGmrWLoGlIGB4+PgDYi4uJ7NWXAOcQPScPH2Ty46Np0jqYFhFRtIiIplloOK4eHrW2P+LSI4HXZc7FzY22CR1pm9DRCMLWr2PbiiVsWZaEvbiYFhHRaK3Zn7qZpqFhWKzW2q6yEOIypx0OAJTFQkrSQpbPmMqIf/8HF1dXbEVF1GnoT1i3njRrF06TNiG4uLlVWI6Lmxtdh952atnVjQ7X3cSezRtY8/NsVs2ZicVqpUmbECMQC4+maWg7XN0lEBN/n4xcLypUXFRIUV4e3vX8OLYnnS+ffog+9zxIzFVXYysuxmKxSBAmhKgRtqIiDu1MM04dbtnMga2p3PjcqzQNCSN9w19s+mM+V468D+96fudtm0UF+RzYksLelI3s3byRQzvT0A4HFqsLN7/xHk3aBFOQk4PVzRVXN/fztl1xaZCR68U5c3VzN79M6jVpyqCnXqBZaDgAKX/+zpJvvyK4QxdCOnWjRUSUBGFCiPOmIDeHA9tS2Z+6mf1bUzi0Iw17cTEADZq3JLRzd7MfVmB0HIHRcee9Dm4engTGxpuDVhfm5XFgqxGINWjeAoDVP83ir19+4oGJU3FxcyPr2BG86vrJXLqiUhJ4Oe3fv5969erh7e1d21W54Li4uRGc2Nlcrt+8BS2jYkldsogNv8/D07cOwR2dQVi4BGFCiHOTffwYDruduo0ac+LgfiY9Phq0xmK10rh1W+L6D3T2z2qHV526tVJHdy8vguISCIo71YjRun0HvOvWM09l/vLx+xzavo2AkHbOU5NRNGkbKoHYBcDhcLBr1y7q1q1Lw4YNa7UucqoR48qWjz76iBMnTtCsWTOCg4MJDg4mICAAi8VSbdu92BUXFpCevI6ty5PYuW41xYUFeNapS3CHzrTr0kM65gshytEOBxkH9pGfnUXzsEi0w8En99xMSKdu9L3vYbTDwao5M2ka0o4mbUMuqv5Uu/5aw+6Nf7Fn80aO7t4FWuPi5k7TkHbGBU0R0TRpGyzDV9SgwsJC3N3dKSgoYOzYsSQkJNC/f/9q325lpxol8MIIvA4cOEBaWhppaWns378fAG9vb9q2bUvbtm1p06YNXl5e1VaHi11xYQG7kteybfkSdqxbRbPQcG564Q0ADu/cjn9gEBaLtIQJcbmx24o5vHM7+7eksM/ZP6sgJ5sGzVsy8t//ASBt9XL8mjSlYYtWtVzb8yc/J5t9qZvYt3kjezdv4OiedADCu1/J1Q89idaawzvS8A9sLcP3nGe5ubls2rSJ9evXA3DfffcBsHfvXpo0aYJrDbRASuB1jnJzc9mxYwdpaWls376d/Px8lFL069ePTp064XA4UEqhlKqxOl1MigsLyMvMpG6jxuSePMGE0XfQbdjtdLxhKA6HHUCCMCEuYQe2bWHXX6vZt2Uzh9K2YSsuAsAvoCnN2kXQLDScZmER+DVpWss1rTn52VnsS9mEVz0/moWGcfLwIb545B5633U/sf0GkJ+TzclDB2gc1Fa6a/wNNpuNbdu2sX79etLS0nA4HAQEBBATE0OHDh1q/OyVBF7/gMPhYP/+/aSlpRESEkLz5s3Zv
Xs3M2bM4Oabb6ZZs2a1Uq+Lha2oiJ3rVtG4dVvqNmrCjrUrmf/peII7diW0U1eahUVIECbERe7wrh2kLP6DHrfeidXFhYVTPuOveT/TKKi1EWi1C6dZaPh5verwYldUkE968lqatA2lTkN/Uhb/wS+fvI+bpyfN2kWYpyYbBbWW78gz0Fqzb98+1q9fz6ZNmygoKMDX15fo6Giio6Np3LhxrdVNAq/z7MCBAyxdupSBAwfi4eHBsmXL2Lp1K8HBwbRt25bGjRtLa9gZHEzbypqfZ7Nz3WpsRYV41a1HSKeuhHTqRrN24fIFI8QFTGvNiYMH2L9lM/u3pBB/7fX4twxk6/Ik5v3fB9z2rw9o0KwFeZkncXFzw81TumdUVX52Fns2rWfv5g3s3byRjAP7AHDz9KJ5WAQtIqJpER5Fo8DWqMu877HWGqUUW7Zs4dtvv8XV1ZWwsDBiYmIICgq6IPpmS+BVzdatW8eqVas4dOgQAL6+vmYH/datW+PuLmO8nK64oICdf61m2/Il7PxrDbaiQrzr+ZlXRzYPi5TgVYha5rDbOZK+0wy09m9NIS/zJAAevnXoN/pR2iZ0xG4rBpT0VTqPck5ksM85htjelI2cOLgfZbHw0KRvcfP04mDaVqyurjQKbF3bVa0xNpuN//73v7Rt25bu3btTXFzM5s2bCQsLu+B+ZyXwqiFZWVls376dtLQ0du7cSWFhIRaLhZYtWxIREUFiYmJtV/GCVFSQz66/1rB1eRK7/lpLvSYBjHjvYwCO79+LX0BTaQkTogaUtCTkZZ7kf+PHcnDbFooLCwCo26ix2TerWWgE9Zs1lz9HNSg74xhHd++idZzxO/Ld689TmJvL7e98CEDaqmXUaxxAwxatLpkWMbvdzs6dOzly5Ahdu3YFYM6cObRo0YL27dvXcu0qJ4FXLbDb7ezdu9e8UtLf358hQ4YA8McffxAcHEyLFi1quZYXnqKCfLKPHaVB85YUFxXyf/fcSnSffvS841601qD1JfOlIkRtsxUVmWNQzXjjeRq2COTKkffhcNiZ/sqzNApqY/TPaheOb/3aHftIlJV9/Bh5mSdp3LotdpuNj+8ahq2wEA/fOrQIizTnmmzQvOVFFyAfOnSI9evXs2HDBnJzc/Hx8eHRRx+tkasRzxcJvC4ANpsNFxcXcnNz+fDDD+nVqxedOnUiNzeXDRs2EBwcTIMGDS66D0h1shUXs331cuo3bU6jwNYc3L6VH8e+RXDHroR07kazkDAJwoSoIq01mUcOO08bGqcOXdzczRaTRf/9gnqNmhDbb0At11T8HVlHj5jTG+1N2UDW0SMAeNapa3TUD48iKC6Buo1qr8N5ZbKzs9m4cSPr16/n8OHDWCwWQkJCiI2NpW3btrhcZKexJfC6wNhsNhwOB25ubqSmpjJ9+nQA/Pz8aNu2LcHBwQQGBuJ2holdL1eHd+1g5ffT2fXXGmzFRfj41Se4pGO+BGFClOFw2Dm6O92c33D/1hRyT2QA4OHtQ9PQMJqHR5Fw7Q3yh+8SlHnksNFR3xmMZR8/ypUjR9H+6oHkZZ4kbdUygjt2rbWZAEocO3aMefPmsWPHDrTWNGvWjJiYGCIjIy/qsTNrLfBSSqUD2YAdsGmtE5RS9YHpQCCQDgzVWp+orJxLLfA63YkTJ8y+Ybt27aK4uBgXFxcCAwPNKyUbNGhQ29W8YBTl57FjndExf1fyGuzFxfjUb0Bwxy6EdupO05B2EoSJy05xUSGHt6fRtF0YFouVP6Z8yl+//ASAb0N/moWG0zzMGEOrQfOW8hm5jJS0drp5euJVpy5blyfx8wfvcNu/PqBx67Yc2LaFY3vSaRERRb0mTas1ENdas3v3bpRStGrVipycHL744gsiIyOJiYmp9el8zpfaDrwStNbHSj33LpChtX5bKfUs4Ke1fqayci71wKu04uJidu/ebQZix48fRynFmDFj8PT05OTJk3h7e19U57qr06kgLIldyWtx9/Jm1IQvsVisZB07gm/9hvIDIy5J+TnZHNiaQqPANvg2aEhq0kLmfvxvbn/nIxoFtubQ9m2cOHSAZu3CqdOwUW1XV1xAtNacPHSAuo2bYLFY+fPrSaz56XsAfOo3MMcQaxEeRd3GTc5LIJaXl4eXlxdaa8aPH0/9+vW57bbbzPpcaq2uF1rgtRXoqbU+qJQKABZprUMrK+dyCrxOl5GRwYEDB4iMjATgyy+/pLCw0JwCITc3Vyb2dirMy+PEwf00aROMdjj47ME7aRUVR/8HHgMuzQ+3uHxkHT3C/i2b2efsn3V83x4Aet01mrh+15KXlcnBtC20iIjGzcOzlmsrLiZaazIO7GNfykb2bN7IvpSN5rAhvg38aREeSYvIGCKu6H1O36F5eXls3ryZ9evXc+zYMZ566ilcXFw4fPgwfn5+l3R3mtoMvHYBJwANfKq1/kwpdVJrXc+5XgEnSpZPy3sfcB9Ay5Yt43fv3l1t9byYbN++HZvNRrt27bDZbLz77rvUqVPHHDesZcuWF10nxOrgsNvZumwxPg0a0iI8iqyjR/j2lWeM05GduxHQNlRawsQFrbiokM2Lfjc7wmcfPwoYA2o2DQ0zTh22i6Bx22Bc3S6sMYzExU1rTcb+vUZHfWc/MZ/6Dbjj3fEArPl5Nn4BTWkT37FcXpvNxvbt20lOTmbbtm04HA4aNWpETEwMiYmJl3SwVVptBl7NtNb7lVKNgN+Ah4EfSwdaSqkTWutK55G4nFu8KlNUVMS6detIS0sjPT0du92Oq6srrVu3NvuG1atXr7areUE4vm8Pi6dOYff6ddhtNnwaNCTU2TFfgjBxoVg3dw6uHp5E9eprDBFw5zDcvb3N+Q2bh0XQsGUrGddO1CjtcJCXlYl3PT+01kx8+B6C4hLoc/f9OBx2fv9iAp5Nm3Os0MbWNGN+Y29vb6KiooiJiaFJk/NzuvJickFc1aiUehXIAe5FTjWed0VFRezatcscNywzMxMAf39/BgwYQGBgYO1W8AJRmJfLjjUr2bpiiRmEWaxW3L198PD2Zvhr7+JVtx5pq5aRvn4dfe5+AGWxsH9rKjkZx/Hw9sHd29u49/HB3ctLfgTFOSvMy+XA1lT2b00hPzuLq+59CIDprz2Ll29dBj7xHGCMXu5dz++y+9ESFzbtcFBcWICLuwfZx44w5cUxnGjaGhwOPG0FBDZpTGRsHK0iY/Cpf3leGFZZ4FVt56SUUt6ARWud7XzcF3gd+BEYAbztvJ9TXXW4nLi5uREaGkpoaChaa44dO2YGYSV9wLZs2cL69esZOHDgRX2Z7j/h7uVNeI9ehPfoZQRha1dxfO9uCvNyKcjJwdXdA4CMA/vZlbzWbAlbP/9/pC5ZVGGZbp5euHt741WnHrf9axwAmxb+RnbGMToPvhmAPZs2YCsqdAZ4p4I3l8uk2f1yl51x7NSwDltSOLon3RwMOKBtKA6HHYvFyk0vvIHV5dSFMz5+9Wuv0kKcgbJY+OGnn3FxceHGG2/kkQlTWL54EZbMDA5tS2XfX6uYt3QhAH4BzWgREUX7
q6+jQXMZNByqMfACGgOznf/UXICpWut5SqnVwHdKqbuB3cDQaqzDZUkphb+/P/7+/nTp0sV8Pj8/nxMnTuDhYQQXy5YtIy8vj+DgYJo3b47Venm13Lh7eRPe/coK13W8fggdrx9iLvcccS8drruJgtwcCnJzKczNobDU44LcHLTDYabfvzWVY3vTzcBr6fT/cmBbarntWF1dnYGYD/4tA7n2MeMC33Vz5+Di7kF0734ApG/4C4vFeqq1zdsbd08vOUV6ASo5i6CUYuPC+az8fjqZRw4D4OruQUBIOzoPvpnmYREEtA3F1fl5BMoEXUJcKBwOh3lGpV+/fiilaNy4sfmboSwWuvTsVSq9naPpu8z+YVuW/klUL+O7bPeGZNJWLaPr8Nvx9PGtlf2pbTKA6mVs9uzZbNy4EYfDgYeHR5m+Yb6+l+cHorpkHjlMXuZJM0graWEreVyYk4NXvXr0vut+AKa/+iwePj5c99SLAEwYdTu5J8sOd6eUBXcvo7XN3duHVtFx9LhlJAArvp+Of6sg2sR3MMbNWb8O99KnSb295Uf+PLHbinE4HLi6ubN7YzI/f/guw175Fw1btGLbyqVsWfKn0UerXTiNAltjucz+4IiL15EjR8ype7Kzs3F3d2f06NH4+VXaLbsch92OUgplsZA8fy4rZk3j3k8mY3VxYc3Ps8k8cogW4VE0D4+q9QFdz5cLoo/XPyGBV/UpKChg586d5mnJnJwcAAICAggODiYsLIyAgIBaruXlqfTwF0f3pFOQnUWBM0grHbwV5hmtbo1bt6XLkFsB+L/7bqNdlx5cOfI+iosK+ej2weXKd3F3N4IwL288fHxo17UnsX2vweGws2LWtwTGxNM0pB3FRYUc2r7NbJnz8PbG1cPzsu13VJSfx4FtW9i/NYX9qZs5uH0bV468l+je/ck8cojlM78l8brBNGgmp1XExScnJ4dNmzaxfv16Dh48iFKK4OBgYmJiCAkJOS9jSGqHw2yt/2Pyp2xa+Js5GXvDloHOccSiaB4WiadvnX+8vdoggZeoEq01hw4dIi0tje3bt7N37146duxI//79sdvtbNq0ieDg4Mu2f9jFpuTLzWG3c2jHNiNYy8k5Fbw5A7aS4C24Q2fi+g8kPyeb/9x9M1eOvI/2Vw/i2N7dfPnUg2XKVhaLGYSVBGSx/a6lbUJH8rIy2bTwN9omdqZ+02YU5OZw8tDBU61tXt4XVatP7skTZt+sfVs2czR9F1o7UMqCf2AQzdtF0K7bFQS0rfQaISEuaLm5ucyZM4e0tDS01gQEBJhT9/j4+FTrtu02G4d3pjnnmdzI/q0p2AoLQSn8WwbSrusVdLjupmqtw/lWK53rxcVHKUVAQAABAQH06NGD/Px8bDYbAPv372f27NkMGTKEiIgIsrKyyMrKomnTplikn9EFqeQfpcVqpWlIWJXzefr48vi0OWiH8aesjn8jbnrxzXItbCWPC5z93Ry2YsAY6DNp6hQaNG9B/abN2L8lhR/efb3MNtw8PY3AzcvbeXWoD51uHEaTNsGcPHyInetW065Ld7zq1iMvK5O8zJOlLkhwr9bWthOHDlCYm0uTNsHYiov5/KG7sBcX4+LmTkDbEDreOJRmoeE0DWmHm6f8CREXr71795KdnU14eDienp7k5ubSpUsXoqOjady45ibTtrq40DQkjKYhYXS8YSh2WzGHtqc555ncYHaz0A4H373xPDFXXUO7Lj1qrH7nmwRe4ow8PU+Nft28eXPuu+8+c87IDRs2sGDBAry8vMyJvdu0aSOtYZcIi8UKznjazcOTVlGxVc7buHVbHvlqptmq1aRNMNePeYnCXCNIKxO8OS9OyDxyCHuxEbgd3pnGwimf0jIyGq+69diy9E8WTvnMLN/q4uLsr3YqcPPw9qH7LSOp09CfI+k7ObQjjfDuV+Li5kbuyRPmFaWnX5DgsNs5unsXJw7up13XKwD45ZP3UShufuM9XFxd6TfqEeo2DqBx6zbSL05c9LKzs80+vEuWLOHo0aOEhYVhsVi49957a7l2BquLK83ahdOsXTidbhxmPp+fk41Spz6/x/ft5ZdP/m1Ob9SsXQTuF8FvkJxqFH9LXl6eOZ/kjh07yMvLQylFs2bNzFH0mzRpIq1h4pw57HYKcnPw8PbBYrVy8tBBDu/aXq6FrezVpTkMefEt6vg3YuUPM1gy7Use+Womru4eLPzyc9bNdY5ao5RxQYKXMQbbycOHKC7Ix+riwkOTv8PFzY2DaVtx8/SSS9/FJaOgoMCcumfPnj08+uij+Pn5cfLkSTw9PXF3vzhnPji0fRt/fj2Jg2lbsNtsKGWhces2RiAWEU2z0LBaa5WWPl6iWjkcDg4cOGB20D9w4AAADRs25MEHH0Qphd1uv+yGqxC1o7iokPysTHwb+KOU4vDO7Rzdk15u+I/C3Bzq+DeiWWg4zdpF4NugYW1XXYjzxm63s2PHDtavX8+WLVuw2+00bNiQmJgY2rdvf0nN8VtcVMjBbVvMU5MH07bhsNtQFgtN2gRz7WPP1PhE8dLHS1Qri8VC8+bNad68OVdeeSU5OTns2LGD/Px8sy/Op59+Sps2bejXzxjLRSasFtXF1c0d11Jfso1bt6Vx67a1WCMhakbJBVLr169n48aN5Obm4unpSXx8PDExMTRt2vSS/N51dXOnZWQMLSNjACguLODAViMQO7AtFe96xvAXSdO+xMPbh8RB5a/yrkkSeInzzsfHh5iYGHPZbrcTGhpqdtbMzc01A7Hg4GBat25tDuoqhBDi3NhsNlxcXCgqKmLSpEnmd25MTAxt27bFxeXy+ql3dfegVXQsraJjyzzv28Cf+k2b1U6lSpFTjaLGZWRksGDBAnbs2EFhYSEWi4WWLVuag7c2atTokvxXJmpXfn6+eSsoKKCgoAAPDw8aNGhAnTp1pD+iuCjNmzePPXv2cN999wGwY8cOAgIC5EKnWiZ9vMQFyW63s3fvXrOT/uHDxrQqderUITg4mCuuuII6dS7OwfPE+edwOCgoKDADp4rurVYrvXv3BuCnn36ioKCAIUOMqZ8+/vhjjh07VmHZVquV+vXr07ZtW/N0+IEDB6hTp061j2EkRFU5HA52797Nhg0b6NevHx4eHmzYsIFjx45xxRVXSD/aC4j08RIXJKvVSmBgIIGBgfTp04fMzEwzCNu8ebP5A5qSksLJkyfp1KmTtEpc5Ox2e6XBU0FBAVdddRVKKZYuXcrOnTu5/fbbAZg2bRppaWlnLLskeCp539StW7fMkCg9e/bEZrPh4eGBp6cnHh4e5OXlcfz4cTIyMjh+/HiZ8r7++mvatWvHoEGDcDgczJo1Cz8/P+rXr0+DBg2oX78+Pj4+0jorqt2xY8fMqXsyMzNxc3MjNjaWVq1aER0dXdvVE+dIAi9xwahbty7x8fHEx8fjcDjMICstLY19+/aZE36vWbMGX19fgoKCcHNzq80qX5bsdnulrU6JiYl4eXmxefNmVq9ezW233YaLiwu//vory5cvr7RsFxcXrrjiCtzd3XF1dS3T9y8
uLo42bdqUCZxK358+lUmPHmUHWIyMjKxwm0FBQeWe01ozePBg83RNfn4+hw4dIjU1FUepydDd3NzMQKxBgwaEhITQvHnzMhNlC/F35OXlmVP37N+/H6UUbdq0oU+fPoSGhsp330VMAi9xQSrdsnXddddRWFgIGE3tixYtIicnx2wxKxk3rGRwV3F2NpvtjIFTSEgIfn5+7N27lyVLlnD11VdTr1491qxZw6+//kqxc6DTMwkNDcXLywutNQ6Hw+z427p1a9zd3SsMmjw8PPDw8CgTPHXo0IEOHTqYy+Hh4dV2PE5X8iNXwtvbm4cffhi73U5mZmaZVrKMjAwOHDhASkoKnp6eNG/enIyMDD7//HOuv/562rVrR1ZWFunp6WaQVrolTojS7HY7M2fOZOvWrTgcDho1akTfvn2JiooyBz4VFzcJvMRFoWSAP4vFwmOPPcbu3bvNOSXnzZvHvHnz8PPzM4OwwMDA8zKZ64WsuLi4wuApICCARo0acfLkSRYuXEjHjh1p2rQpO3fuZPbs2RQUFFQaPNWpUwc/Pz9sNhsnTpygqKgIAH9/f+Lj488YOJXcl1xBFRkZWaaVqeS1uZiVnM6sX79+uXU2m81sDbNarURGRuLnZ1zGvnv3br7//nszraenp3m6svS9v7//Jf++FeUdPHiQffv2kZiYiNVqxWq10rFjR2JiYmjSpEltV0+cZ9K5Xlz0MjIyzL5hu3btwmaz0bdvX7p06UJhYSG5ubkV/lBeCIqLi8sFTnXq1CEgIICioiJ+//13QkJCaNOmDRkZGUybNs1MVzKP5ulK9v348eN89dVXDBgwgJCQEI4cOcKyZcvKBEpnCp6kk+75ZbPZyMjIKNNKVnKflZVlprvnnnto3rw527dvZ9OmTfTr1w9PT0/y8vJwcXGR00uXkJMnT+Lr64vVamXBggWsWrWKJ5988qIdRV6UJZ3rxSWtfv365mmp4uJidu/ejb+/P2D0D5s5cyb33nsvzZo1Iy8vDzc3t/M2ro3WulzLk6urK02bNgUgKSmJhg0bEhYWhs1m48svvywTaNnt9nJldujQgYCAACwWC8nJyfj5+dGmTRvc3Nxo2LBhpa1NHh4e5umIBg0a8Pjjj5vlNmrUiOuvv/687Lc4Ny4uLjRq1IhGjcqPnl1UVMSJEyc4fvy4+b7NzMxkx44dZuvXn3/+ycqVK/H19S3XSlbSAictZRe+wsJCUlJSWL9+Penp6dx6660EBwfTpUsXunXrJkHXZUJavMQlLTMzk61bt5KQkIDFYmHu3Ln89ddfBAUFmae+6tatS1FRkXlVncPhICAgAID169djsViIiooCYNasWZw4caJMoHV68BQcHMytt94KwPvvv09oaCgDBgxAa83UqVMJDAzEw8MDpVSFN4vFIldvijJsNpt5KtPhcGC32zn9u9tiseDr64tSCpvNhtb6ognGPDw8aN68+UVT33PhcDjYuXMn69evJzU1FZvNhp+fHzExMcTFxVG3bt3arqKoBtLiJS5bdevWLddBW2tNWloa27ZtA4wfrNJXqjVs2JCHHnoIgLVr15YJvIqLi3Fzc6NOnTpnbHUqPfbYY489ZgZRSim6dOmCr68vDRo0kCvexD9ScuGCzWbDbrfjcDjMH/Fjx46htTZb0DIyMtBaY7VacXFxMW9Wq7XW34daa44fP86+ffsqvML0YnX48GFzCIicnBw8PDyIjY0lOjqaFi1a1PpxF7VHAi9xWSkZN0xrzbFjx9i+fTu5ubllAqfSA2aWDIVQYvjw4ee0vdNbrgoKCggMDJQvXfGPWSwW3NzcKuz35efnV6ZFzGKxUFxcTFFRUbmWspJgzN3d3Xzv2+12LBZLjbxPlVI0aNCAo0ePVvu2qltRURFubm5orfnuu+84ceIEwcHBxMTEEBwcfEm26IlzJ4GXuCwppfD39zdbBM6kOjozS9AlqtvpF0fUq1cPoMwQH3a73WwxK7mVpDly5AheXl7UrVsXrTVZWVllWsnOd0vZpfCZWLZsGUlJSTzxxBO4urpy4403Uq9ePby9vWu7auICI4GXEEJcJpRSZuBUmTp16pitM3a7ndzc3HJpTj9l6e7uftlMxqy1Zs+ePaxfv56OHTvSuHFjmjVrRvv27bHZbLi6utKsWe1PxiwuTNKDVwhxTpKSkoiIiCA2Npb8/Pxq286UKVM4cOCAuXzPPfeQkpJSaZ6ePXtS+kKc5ORklFLMmzfPfC49PZ2pU6eWSTN37ty/Xc/AwMAzzgFZHf5pfU+Xk5PD/fffT5s2bWjfvj0JCQlMnTrVbO11cXExx4Zr0KABdevWxdvbG6vVis1mIycnh8zMTHOQ4+LiYo4cOWIuX3HFFSxbtqzCCwIuNhkZGSxcuJCPPvqIyZMns3HjRnOO2VatWnHVVVfJ4LjirC6PvydCiHNit9vP2CryzTff8Nxzz3HbbbdVqSytNVrrc75Sc8qUKURGRppDc0ycOPGc8oMxv2O3bt2YNm0a/fv3B04FXrfccgtgBDJr1qzhmmuuOefya8P5ru8999xD69atSUtLw2KxcPToUSZNmlQmTcmpQHd393JDHmitsdvtZU4XWq1W8/V2OBxkZmZy+PBhlFJmC9npLWYXqvz8fDZv3sz69evZu3cvAK1bt6Znz560a9dOhoAQ567kS/FCvsXHx2shLgUpKSm1XQW9a9cuHRoaqm+55Rbdrl07PXjwYJ2bm6tbtWqlx4wZo+Pi4vS0adP0r7/+qjt16qTj4uL0TTfdpLOzs/Xnn3+u/fz8dGBgoL7lllu01lq/++67OiEhQUdFRemXX37Z3EZISIi+/fbbdXh4uE5PTz9junbt2ul77rlHh4eH66uuukrn5eXpGTNmaG9vbx0SEqJjYmJ0Xl6evuKKK/Tq1au11lqPHj1ax8fH6/DwcLMsrXWZNA6HQwcFBent27frgIAAnZ+fr7XWumPHjrpOnTo6JiZGv/3227pFixa6YcOGOiYmRn/77bd65cqVulOnTjo2NlZ37txZb9myRWuttc1m008++aSOiIjQUVFR+qOPPtJaa92qVSv98ssv67i4OB0ZGalTU1O11lq/8sor+o477tDdunXTLVu21LNmzdJPP/20joyM1P369dNFRUVaa63XrFmje/Toodu3b6/79u2rDxw4YO7LmDFjdGJiog4ODtaLFy/WhYWF5ep7ulWrVunOnTvr6OhonZiYqLOysvTkyZP1gw8+aKYZMGCAXrhwod6+fbsOCgrSdru9wvfKwoULdbdu3fTAgQN1cHCw1lrr6667Trdv316Hh4frTz/91Ezr7e2tH3vsMR0eHq579eqljxw5orXWukePHvqJJ57Q8fHxuk2bNvqnn37Shw4d0vv37y9z27hxo9Za6x07duiFCxeax8dms1X+hq5GP/30k3799df1K6+8osePH6+TkpL0yZMna60+4uIBrNFniGmkxUuIWvLaT5tJOZB19oTnILxpHV4ZGHHWdFu3buWLL76ga9eu3HXXXfznP/8BjEFX161bx7Fjx7jxxhtZsGAB3t7evP
POO7z//vu8/PLLLFmyhGuvvZabbrqJ+fPnk5aWxqpVq9BaM2jQIBYvXkzLli1JS0vjyy+/pFOnTmdNN23aND7//HOGDh3KrFmzuO222/j4448ZO3YsCQnlh8J56623qF+/Pna7nd69e7Nhwwaio6PLpFm2bBlBQUG0adOGnj178r///Y/Bgwfz9ttvM3bsWH7++WcAGjduzJo1a/j4448ByMrKIikpCRcXFxYsWMDzzz/PrFmz+Oyzz0hPTyc5ORkXFxcyMjLMbTVs2JB169bxn//8h7Fjx5qtczt27GDhwoWkpKTQuXNnZs2axbvvvssNN9zA//73PwYMGMDDDz/MnDlz8Pf3Z/r06bzwwgtmi5PNZmPVqlXMnTuX1157jQULFvD666+XqW9pRUVFDBs2jOnTp5OYmEhWVlalp742b95MTExMpa2R69atY9OmTeZQD5MmTaJ+/frmhOiDBw+mQYMG5ObmkpCQwLhx43j99dd57bXX+Pjjj82WsDVr1jB37lzef/99FixYgNa6TCf/kydPArBnzx6WLl1qTnL+888/s3Xr1goHjm3QoMF5bXE6evQomzZtomfPniil8PLyIiEhgZiYGAICAi6JiwBE7ZPAS4jLUIsWLejatStgDJnx0UcfATBs2DAAVqxYQUpKipmmqKiIzp07lytn/vz5zJ8/n7i4OMDoL5SWlkbLli1p1aoVnTp1Omu6oKAgYmNjAYiPjyc9Pf2s9f/uu+/47LPPsNlsHDx4kJSUlHKB17Rp08zhP4YPH85XX33F4MGDz1p2ZmYmI0aMIC0tDaWUOa/lggULGD16tNmBvPQ0VDfeeKNZ/9JzMl599dW4uroSFRWF3W43T3dGRUWRnp7O1q1b2bRpE1dddRVgnOItGbz39HKrcly2bt1KQEAAiYmJAGXGlKuKt956ixkzZnDkyBGzf12HDh3KjK/10UcfMXv2bAD27t1LWloaDRo0wGKxmO+f2267zaz7mfZDKYWrq6vZib8kqOnZsyfdunUzg8E2bdqglCIjI8MciLQ0b29vAgICzFPf+/fvx83N7axXLJfIysrC1dUVT09PDhw4wJIlS4iMjMTf359evXpV/eAJUUUSeAlRS6rSMlVdTv/nXrJccum71pqrrrqKadOmVVqO1prnnnuOUaNGlXk+PT29zGX0laUr3WJhtVrP2mF/165djB07ltWrV+Pn58fIkSMpKCgok8ZutzNr1izmzJnDW2+9ZQ7SmZ2dXWnZAC+99BJXXnkls2fPJj09nZ49e541T8k+lHQ4P/15i8WCq6ureZwtFos5unxERATLly8/p3JL69evH4cPHyYhIYFHH320wjQuLi5lBgkuOV7h4eGsX78eh8OBxWLhhRde4IUXXigzll3p13HRokUsWLCA5cuX4+XlRc+ePcsd+xKl32NV2Y/T61vi9MnWi4qKys17Wdovv/yCi4sLI0eOBOCHH37AYrGUaSXz8fEhLS2NDRs2sHPnTq666iq6dOlCeHg4ISEh0kFeVCu5qlGIy9CePXvMH/upU6fSrVu3Mus7derE0qVL2b59OwC5ubnmSP+l9evXj0mTJpGTkwMYrQ1Hjhz52+lK8/X1rTBQysrKwtvbm7p163L48GF++eWXcml+//13oqOj2bt3L+np6ezevZvBgwcze/bscuWevpyZmWkOBTBlyhTz+auuuopPP/3UDBxO/8H/O0JDQzl69Kj5WhQXF7N58+ZK85xe319//ZXk5GQmTpxIaGgoBw8eZPXq1QBkZ2djs9kIDAwkOTkZh8PB3r17WbVqFQBt27YlISGBF1980Zz6qqCg4IxXH2ZmZuLn54eXlxdbtmxhxYoV5jqHw8HMmTOBit9T54ubmxtNmjQhPDyc7t27c91113HdddeZ6wcOHGi2IJbUecuWLfz2229Mnz6d//znP7z77rvMnj2b48eP0717d9q1awdgtnwJUZ2kxUuIy1BoaCiffPIJd911F+Hh4dx///2MHz/eXO/v78+UKVO4+eabzWEB3nzzTUJCQsqU07dvX1JTU83TkD4+Pnz99dflrlKrarrSRo4cyejRo/H09CzTIlQyx127du3KnDItbdq0adxwww1lnhs8eDD/93//x48//ojVaiUmJoaRI0cyYsQI3n77bWJjY3nuuecYM2YMI0aM4M0332TAgAFm/nvuuYdt27YRHR2Nq6sr9957rzm11N/l5ubGzJkzeeSRR8jMzMRms/HYY48REXHm1tArr7yyTH1LTu+VlDd9+nQefvhh8vPz8fT0ZMGCBXTt2pWgoCDCw8MJCwujffv2Zp6JEyfy9NNP07ZtWxo0aICnpyfvvvtuhdvu378/EyZMICwsjNDQUPNUMhgtY6tWreLNN9+kUaNGTJ8+/R8dm7+rcePGZZZHjBgBGFcnlrSSZWZm0qJFC1q2bCnzoooaJ5NkC1GDUlNTCQsLq9U6pKenc+2117Jp06ZarYe4tPj4+Jgtmn/HhfDZEOJ8qWySbAn1hRBCCCFqiAReQlxmAgMDpbVLnHf/pLVLiMtJtQdeSimrUuovpdTPzuUgpdRKpdR2pdR0pdT5n4VYCCGEEOICVBMtXo8CqaWW3wHGaa3bAieAu2ugDkIIIYQQta5aAy+lVHNgADDRuayAXsBMZ5Ivgeursw5CCCGEEBeK6m7x+gAYA5SM3NcAOKm1LhlBbx/QrJrrIIQQQghxQai2wEspdS1wRGu99m/mv08ptUYptebo0aPnuXZCiL8rKSmJiIgIYmNjzzrK/D8xZcoUc9oaMMbRSklJqTRPz549KT30THJyMkop5s2bZz6Xnp7O1KlTy6SZO3fu365nYGAgx44d+9v5z9U/re+ZvPrqq4wdOxYofxyFEOdPdbZ4dQUGKaXSgW8xTjF+CNRTSpUM3Noc2F9RZq31Z1rrBK11QlXn3BJCnB8lo5hX5JtvvuG5554jOTm5SqN8a63LTFdTVacHXhMnTiQ8PPycypg2bRrdunUrM/XR+Q68atqFXN+qTAckxOWu2gIvrfVzWuvmWutAYDjwh9b6VmAhcJMz2QhgTnXVQQhRXnp6Ou3atePWW28lLCyMm266iby8PAIDA3nmmWdo3749M2bMYP78+XTu3Jn27dszZMgQcnJymDhxIt999x0vvfQSt956KwDvvfceiYmJREdH88orr5jbCA0N5Y477iAyMpK9e/eeMV1YWBj33nsvERER9O3bl/z8fGbOnMmaNWu49dZbzZa10q0w999/PwkJCURERJhlnU5rzYwZM5gyZQq//fabOafgs88+S1JSErGxsbzzzju8/PLLTJ8+ndjYWKZPn86qVavo3LkzcXFxdOnSha1btwJGMPrUU08RGRlJdHR0mZH+x48fT/v27YmKimLLli2A0YI0YsQIunfvTqtWrfj+++8ZM2YMUVFR9O/f35x8e+3atVxxxRXEx8fTr18/Dh48CBitTs888wwdOnQgJCSEpKQkioqKytW3tNWrV5sTUs+ZMwdPT0+KioooKCigdevWAHz++eckJiYSExPD4MGDycvLO+N7x
eFwMHLkSHNKoaefftp8DT/99FPAmL+xe/fuDBo06JwDYyEuS1rrar8BPYGfnY9bA6uA7cAMwP1s+ePj47UQl4KUlJSyT0y65uy3JR+WTb/ua+NxzrHyaatg165dGtBLlizRWmt955136vfee0+3atVKv/POO1prrY8ePaq7d++uc3JytNZav/322/q1117TWms9YsQIPWPGDK211r/++qu+9957tcPh0Ha7XQ8YMED/+eefeteuXVoppZcvX37WdFarVf/1119aa62HDBmi//vf/2qttb7iiiv06tWrzXqXXj5+/LjWWmubzaavuOIKvX79+nJplixZonv16qW11vrmm2/WM2fO1FprvXDhQj1gwACz3MmTJ+sHH3zQXM7MzNTFxcVaa61/++03feONN2qttf7Pf/6jBw8ebK4rqUOrVq30Rx99pLXW+pNPPtF333231lrrV155RXft2lUXFRXp5ORk7enpqefOnau11vr666/Xs2fP1kVFRbpz5876yJEjWmutv/32W33nnXea+/LEE09orbX+3//+p3v37l1hfUsrLi7WQUFBWmutn3zySZ2QkKCXLFmiFy1apIcPH6611vrYsWNm+hdeeMGs+yuvvKLfe+89c9vLly/Xw4cP12+++abWWutPP/1Uv/HGG1prrQsKCnR8fLzeuXOnXrhwofby8tI7d+6ssE5VVe6zIcRFDFijzxDT1MhcjVrrRcAi5+OdQIea2K4QomKl5zi87bbb+OijjwDMef9WrFhBSkqKmaaoqMicZ7G0+fPnM3/+fOLi4gBjEM20tDRatmxJq1atzLn8KksXFBREbGwsAPHx8aSnp5+1/t999x2fffYZNpuNgwcPkpKSQnR0dJk006ZNY/jw4QAMHz6cr776isGDB5+17MzMTEaMGEFaWhpKKbNlasGCBYwePRoXF+Nrs379+maeklam+Ph4vv/+e/P5q6++GldXV6KiorDb7fTv3x+AqKgo0tPT2bp1K5s2bTIndbbb7QQEBFRYblWOi4uLC23atCE1NZVVq1bxxBNPsHjxYux2O927dwdg06ZNvPjii5w8eZKcnBz69etXYVmjRo1i6NChvPDCC4DxGm7YsMGcCDszM5O0tDTc3Nzo0KEDQUFBZ62fEEImyRaidt35v7+f3rvBued3MkZ2Kb/s7e0NGC3hV111VZm+URXRWvPcc88xatSoMs+np6ebZZ0tnbu7u7lstVrP2mF/165djB07ltWrV+Pn58fIkSPN04gl7HY7s2bNYs6cObz11ltorTl+/DjZ2dmVlg3w0ksvceWVVzJ79mzS09Pp2bPnWfOU7IPVai3Tz6nkeYvFgqurq3mcLRYLNpsNrTURERFlJgGvSrml9evXj8OHD5OQkMDEiRPp0aMHv/zyC66urvTp04eRI0dit9t57733AGPy8R9++IGYmBimTJnCokWLKiy3S5cuLFy4kCeffBIPDw+01owfP75coLZo0aIyr7UQonIyZZAQl6E9e/aYP/ZTp06lW7duZdZ36tSJpUuXsn37dgByc3PZtm1buXL69evHpEmTzOli9u/fz5EjR/52utJ8fX0rDJSysrLw9vambt26HD58mF9++aVcmt9//53o6Gj27t1Leno6u3fvZvDgwcyePbtcuacvZ2Zm0qyZMcrNlClTzOevuuoqPv30UzMAysjIqLT+VREaGsrRo0fN16K4uJjNmzdXmuf0+v76668kJyczceJEALp3784HH3xA586d8ff35/jx42zdupXIyEgAsrOzCQgIoLi4mG+++eaM27n77ru55pprGDp0KDabjX79+vF///d/Zgvgtm3byM3N/Uf7L8TlSAIvIS5DoaGhfPLJJ4SFhXHixAnuv//+Muv9/f2ZMmUKN998M9HR0XTu3NnsNF5a3759ueWWW+jcuTNRUVHcdNNNFQZLVU1X2siRIxk9enS5YStiYmKIi4ujXbt23HLLLebp0NKmTZvGDTfcUOa5wYMHM23aNKKjo7FarcTExDBu3DiuvPJKUlJSzM7qY8aM4bnnniMuLq5MK9M999xDy5YtiY6OJiYmpsyVkX+Xm5sbM2fO5JlnniEmJobY2FiWLVtWaZ7T63u6jh07cvjwYXr06AFAdHQ0UVFRZmvbG2+8QceOHenatSvt2rWrdFtPPPEEcXFx3H777dxzzz2Eh4fTvn17IiMjGTVqlFzFKMTfoIw+YBe2hIQELWPKiEtBamoqYWFhtVqH9PR0rr32WpkoW1xQLoTPhhDni1JqrdY6oaJ10uIlhBBCCFFDJPAS4jITGBgorV1CCFFLJPASQgghhKghEngJIYQQQtQQCbyEEEIIIWqIBF5CCCGEEDVEAi8hLnOvvvoqY8eOLTMJdWlTpkzhoYceqtY62Gw2nn/+eYKDg4mNjSU2Npa33nrrvJU/cuRIc6obIYSoTRJ4CSFq3YsvvsiBAwfYuHEjycnJJCUlmSOkl6a1xuFw1EINhRDi/JDAS4jL0FtvvUVISAjdunVj69at5vP//e9/iY2NJTIyklWrVpXLVzKafEJCAiEhIfz8888Vlr99+3b69OlDTEwM7du3Z8eOHSxatIhrr73WTPPQQw8xZcoU8vLy+Pzzzxk/fjweHh6AMS3Oq6++ChgDvoaGhnLHHXcQGRnJ3r17uf/++0lISCAiIoJXXnnFLDMwMJAxY8YQFRVFhw4dzCmPABYvXkyXLl1o3bq1tH4JIWqNTJItRC15Z9U7bMkoPw3PP9Gufjue6fBMpWnWrl3Lt99+S3JyMjabjfbt2xMfHw9AXl4eycnJLF68mLvuuqvC8b7S09NZtWoVO3bs4Morr2T79u1mwFTi1ltv5dlnn+WGG26goKAAh8PB3r17K6zP9u3badmyJb6+vmesc1paGl9++SWdOnUCjMCxfv362O12evfuzYYNG4iOjgagbt26bNy4ka+++orHHnvMDA4PHjzIkiVL2LJlC4MGDeKmm26q9DgJIUR1kBYvIS4zSUlJ3HDDDXh5eVGnTh0GDRpkrrv55psB6NGjB1lZWZw8ebJc/qFDh2KxWAgODqZ169bl5nDMzs5m//795lyJHh4eeHl5Vbl+kydPJjY2lhYtWpjBWqtWrcygC+C7776jffv2xMXFsXnzZlJSUsrtw80332xOPg1w/fXXY7FYCA8P5/Dhw1WujxBCnE/S4iVELTlby1RtKJlI+UzLZ0pz55138tdff9G0adMKJ24GcHFxKdM/q6CgAIC2bduyZ88esrOz8fX15c477+TOO+8kMjISu90OgLe3t5lv165djB07ltWrV+Pn58fIkSPNsk6vX+nH7u7u5uOLYY5aIcSlSVq8hLjM9OjRgx9++IH8/Hyys7P56aefzHUlQdOSJUuoW7cudevWLZd/xowZOBwOduzYwc6dOwkNDWXy5MkkJyczd+5cfH19ad68OT/88AMAhYWF5OXl0apVK1JSUigsLOTkyZP8/vvvAHh5eXH33Xfz0EMPmQGU3W6nqKiowvpnZWXh7e1N3bp1OXz4ML/88kuZ9SX7MH36dDp37vzP
DpYQQpxn0uIlxGWmffv2DBs2jJiYGBo1akRiYqK5zsPDg7i4OIqLi5k0aVKF+Vu2bEmHDh3IyspiwoQJ5fp3gdFJf9SoUbz88su4uroyY8YMWrduzdChQ4mMjCQoKIi4uDgz/VtvvcVLL71EZGQkvr6+eHp6MmLECJo2bcqBAwfKlB0TE0NcXBzt2rWjRYsWdO3atcz6EydOEB0djbu7O9OmTfsnh0oIIc47dS5N7kopL611XjXWp0IJCQm6ovGFhLjYpKamEhYWVtvV+NtGjhzJtddee8F2TA8MDGTNmjU0bNiwtqsiztHF/tkQojSl1FqtdUJF66p0qlEp1UUplQJscS7HKKX+cx7rKIQQQghxyavqqcZxQD/gRwCt9XqlVI9qq5UQ4oI0ZcqU2q5CpdLT02u7CkIIUakqd67XWp8+CI/9PNdFCCGEEOKSVtUWr71KqS6AVkq5Ao8CqdVXLSGEEEKIS09VW7xGAw8CzYD9QKxzWQghhBBCVFGVWry01seAW6u5LkIIIYQQl7SqXtUYpJR6Xyn1vVLqx5JbdVdOCFH9Xn31VcaOHUvPnj2paNiWKVOm8NBDD9VIXUpva+TIkTKZtRDiklPVPl4/AF8APwGOypMKIUTt0VqjtcZikYk5hBAXnqp+MxVorT/SWi/UWv9ZcqvWmgkhqs1bb71FSEgI3bp1Y+vWrebz//3vf4mNjSUyMpJVq1aVyzdy5EhGjx5NQkICISEh/Pzzz+XSHDlyhPj4eADWr1+PUoo9e/YA0KZNG/Ly8vjpp5/o2LEjcXFx9OnT56yTVr/00kuMHDkSu93Oe++9R2JiItHR0bzyyiuAMYxEaGgod9xxB5GRkebk2kIIcaGpaovXh0qpV4D5QGHJk1rrddVSKyEuE7tvv+OsaXx69qTB3XeZ6evecAP1brwB24kT7H/k0TJpW/33q7OWt3btWr799luSk5Ox2Wy0b9/eDJTy8vJITk5m8eLF3HXXXWzatKlc/vT0dFatWsWOHTu48sor2b59e5lpgxo1akRBQQFZWVkkJSWRkJBAUlIS3bp1o1GjRnh5edGtWzdWrFiBUoqJEyfy7rvv8u9//7vC+j799NNkZ2czefJkfvvtN9LS0li1ahVaawYNGsTixYtp2bIlaWlpfPnll3Tq1Omsx0AIIWpLVQOvKOB2oBenTjVq57IQ4iKSlJTEDTfcgJeXFwCDBg0y1918882AMZF2VlYWJ0+eLJd/6NChWCwWgoODad26NVu2bCE2NrZMmi5durB06VIWL17M888/z7x589Ba0717dwD27dvHsGHDOHjwIEVFRQQFBVVY1zfeeIOOHTvy2WefATB//nzmz59vzvOYk5NDWloaLVu2pFWrVhJ0CSEueFUNvIYArbXWRdVZGSEuN1VpoTpTehc/v3POfzZKqUqXz5Tmzjvv5K+//qJp06bMnTuXHj16kJSUxO7du7nuuut45513UEoxYMAAAB5++GGeeOIJBg0axKJFi3j11VcrrE9iYiJr164lIyOD+vXro7XmueeeY9SoUWXSpaen4+3t/Q/2XAghakZV+3htAupVYz2EEDWkR48e/PDDD+Tn55Odnc1PP/1krps+fToAS5YsoW7dutStW7dc/hkzZuBwONixYwc7d+4kNDSUyZMnk5yczNy5cwHo3r07X3/9NcHBwVgsFurXr8/cuXPp1q0bAJmZmTRr1gyAL7/88ox17d+/P88++ywDBgwgOzubfv36MWnSJHJycgDYv38/R44cOT8HRgghakBVW7zqAVuUUqsp28dr0BlzCCEuSO3bt2fYsGHExMTQqFEjEhMTzXUeHh7ExcVRXFzMpEmTKszfsmVLOnToQFZWFhMmTCjTv6tEYGAgWmt69DCmdO3WrRv79u3Dz88PMIawGDJkCH5+fvTq1Ytdu3adsb5DhgwhOzubQYMGMXfuXG655RY6d+4MgI+PD19//TVWq/VvHw8hhKhJSmt99kRKXVHR85Vd2aiU8gAWA+4YAd5MrfUrSqkg4FugAbAWuP1spzATEhJ0ReMLCXGxSU1NJSwsrLar8beNHDmSa6+9lptuuqm2qyIuMRf7Z0OI0pRSa7XWCRWtq+rI9X9n6IhCoJfWOsc5v+MSpdQvwBPAOK31t0qpCcDdwP/9jfKFEEIIIS4qlQZeSqklWutuSqlsjKsYzVWA1lrXOVNebTSl5TgXXZ23kishb3E+/yXwKhJ4CXFRmDJlSm1XQQghLmqVBl5a627Oe9+/U7hSyopxOrEt8AmwAziptbY5k+zDmHi7orz3AfeB0adECCGEEOJiV9W5Gv9bledOp7W2a61jgeZAB6BdVSumtf5Ma52gtU7w9/evajYhhBBCiAtWVYeTiCi9oJRyAeKruhGt9UlgIdAZqOfMD0ZAtr+q5QghhBBCXMwqDbyUUs85+3dFK6WynLds4DAw5yx5/ZVS9ZyPPYGrgFSMAKzkkqgRZytHCCGEEOJSUWngpbX+l7N/13ta6zrOm6/WuoHW+rmzlB0ALFRKbQBWA79prX8GngGeUEptxxhS4ovzsB9CiGrw6quv0qxZM3Pi7B9//LFW6rFt2zauueYagoODad++PUOHDuXw4cOsWbOGRx55BIBFixaxbNmyWqnfP5WTk8P9999PmzZtzLkzP//88/NWfs+ePZEheYS4MFR1OInnlFLNgFal82itF1eSZwMQV8HzOzH6ewkhLgKPP/44Tz31FKmpqXTv3p0jR45gsVS1l0LV2Ww2XFzKfyUVFBQwYMAA3n//fQYOHAgYQdbRo0dJSEggISHBfM7Hx4cuXbqc97qdjd1u/0eDuN5zzz20bt2atLQ0LBYLR48erXAA2zMdIyHExaOqnevfBpYCLwJPO29PVWO9hBDV6KuvviI6OpqYmBhuv/120tPT6dWrF9HR0fTu3Zs9e/aUyxMWFoaLiwvHjh3j+uuvJz4+noiICHMCazBGkn/88ceJiIigd+/eHD16FIAdO3bQv39/4uPj6d69O1u2bAGMAVlHjx5Nx44dGTNmDH/++SexsbHExsYSFxdHdnY2U6dOpXPnzmbQBUYLTmRkJIsWLeLaa68lPT2dCRMmMG7cOGJjY0lKSuLo0aMMHjyYxMREEhMTWbp0KUCF2wB47733SExMJDo6mldeecXc1tdff02HDh2IjY1l1KhR2O12c1+ffPJJYmJiWL58eZljtXr1arp06UJMTAwdOnQgOzubKVOm8NBDD5lprr32WhYtWsSOHTtYtWoVb775phnQ+vv788wzzwBGQNm9e3cGDRpEeHg4wDkffzCmeurQoQMhISEkJSVV7Y0ihDjvqvrX6QYgVGtdeNaUQogqSfpuG8f25pw94Tlo2MKH7kNDKk2zefNm3nzzTZYtW0bDhg3JyMhgxIgR5m3SpEk88sgj/PDDD2XyrVy5EovFgr+/P5MmTaJ+/frk5+eTmJjI4MGDadCgAbm5uSQkJDBu3Dhef/11XnvtNT7++GPuu+8+JkyYQHBwMCtXruSBBx7gjz/+AGDfvn0sW7YMq9XKwIED+eSTT+j
atSs5OTl4eHiwadMm4uMrv5YnMDCQ0aNH4+Pjw1NPGf8Jb7nlFh5//HG6devGnj176NevH6mpqYwdO7bcNubPn09aWhqrVq1Ca82gQYNYvHgx/v7+TJ8+naVLl+Lq6soDDzzAN998wx133EFubi4dO3bk3//+d5m6FBUVMWzYMKZPn05iYiJZWVl4enpW+nrExMRU2oq4bt06Nm3aRFBQEMA5H38wWstWrVrF3Llzee2111iwYEGlx1QIUT2qGnjtxBgAVQIvIS5yf/zxB0OGDKFhw4YA1K9fn+XLl/P9998DcPvttzNmzBgz/bhx4/j666/x9fVl+vTpKKX46KOPmD17NgB79+4lLS2NBg0aYLFYGDZsGAC33XYbN954Izk5OSxbtowhQ4aYZRYWnvoqGTJkiHmarmvXrjzxxBPceuut3HjjjTRv3vxv7+eCBQtISUkxl7OyssjJyalwG/Pnz2f+/PnExRm9I3JyckhLS2PDhg2sXbvWnM8yPz+fRo0aAWC1Whk8eHC57W7dupWAgAAzT506ZxxnukJvvfUWM2bM4MiRIxw4cACADh06mEEXcE7Hv0TJ4/j4eNLT08+pTkKI86eqgVcekKyU+p2yk2Q/Ui21EuIycLaWqQtFSR+vEosWLWLBggUsX74cLy8vevbsSUFBQYV5lVI4HA7q1atHcnJyhWm8vb3Nx88++ywDBgxg7ty5dO3alV9//ZWIiAj+/PPcZy1zOBysWLGi3CTeFW1Da81zzz3HqFGjyqQdP348I0aM4F//+le58j08PMyAsV+/fhw+fJiEhAQeffTRCuvj4uKCw+Ewl0uOWXh4OOvXr8fhcGCxWHjhhRd44YUX8PHxMdOWPkbnevxLuLu7A0bAaLPZKkwvhKh+Ve0h+yPwBrAMYyT6kpsQ4iLTq1cvZsyYwfHjxwHIyMigS5cufPvttwB88803dO/e/Yz5MzMz8fPzw8vLiy1btrBixQpzncPhYObMmQBMnTqVbt26UadOHYKCgpgxYwYAWmvWr19fYdk7duwgKiqKZ555hsTERLZs2cItt9zCsmXL+N///memW7x4MZs2bSqT19fX1+yvBdC3b1/Gjx9vLpcEfhVto1+/fkyaNImcHOPU7/79+zly5Ai9e/dm5syZHDlyxDxWu3fvLlfvX3/9leTkZCZOnEhoaCgHDx5k9erVAGRnZ2Oz2QgMDCQ5ORmHw8HevXtZtWoVAG3btiUhIYEXX3zR7D9WUFCAMevaPz/+QogLS1WvavyyuisihKgZERERvPDCC1xxxRVYrVbi4uIYP348d955J++99x7+/v5Mnjz5jPn79+/PhAkTCAsLIzQ0lE6dOpnrvL29zY7ijRo1Yvr06YARzN1///28+eabFBcXM3z4cGJiYsqV/cEHH7Bw4UIsFgsRERFcffXVuLu78/PPP/PYY4/x2GOP4erqSnR0NB9++CHHjh0z8w4cOJCbbrqJOXPmMH78eD766CMefPBBoqOjsdls9OjRgwkTJpxxG6mpqXTu3BkwOql//fXXhIeH8+abb9K3b18cDgeurq588skntGrV6ozHx83NjenTp/Pwww+Tn5+Pp6cnCxYsoGvXrgQFBREeHk5YWBjt27c380ycOJGnn36atm3b0qBBAzw9PXn33XfP2/EXQlw41Jn+VZVJpNQuyk6SDYDWunV1VOp0CQkJWsagEZeC1NRUwsLCarsa1cbHx8dsNRI172I+/pf6Z0NcXpRSa7XWCRWtq2ofr9KZPYAhQP1/WjEhhBBCiMtJlfp4aa2Pl7rt11p/AAyo3qoJIS42F2try6VCjr8QF74qtXgppdqXWrRgtIDJ8MlCCCGEEOegqsFT6RECbUA6xulGIYQQQghRRVW9qvHK0stKKSswHNhWHZUSQgghhLgUVdrHSylVRyn1nFLqY6XUVcrwELAdGFozVRRCCCGEuDScrXP9f4FQYCNwL7AQ4xTjDVrr66q5bkKIWvbqq6/SrFkzYmNjiYyM5Mcff6yVemzbto1rrrmG4OBg2rdvz9ChQzl8+DBr1qzhkUeMCTQWLVrEsmXLaqV+58urr77K2LFjAWMicBlGR4hLz9lONbbWWkcBKKUmAgeBllrriuenEEJcckqmDEpNTaV79+4cOXKk0gmd/y6bzYaLS/mvpIKCAgYMGMD777/PwIEDASPIOnr0KAkJCSQkJJjP+fj40KVLl/Net7Ox2+3m9EG17UzHUQhxYTjbt2dxyQOttR3YJ0GXEBe/r776iujoaGJiYrj99ttJT0+nV69eREdH07t3b/bs2VMuT1hYGC4uLhw7dozrr7+e+Ph4IiIi+Oyzz8w0Pj4+PP7440RERNC7d2+OHj0KGNP09O/fn/j4eLp3786WLVsAGDlyJKNHj6Zjx46MGTOGP//8k9jYWGJjY4mLiyM7O5upU6fSuXNnM+gCozUoMjKSRYsWce2115Kens6ECRMYN24csbGxJCUlcfToUQYPHkxiYiKJiYksXboUoMJtALz33nskJiYSHR3NK6+8Ym7r66+/pkOHDsTGxjJq1ChzWh8fHx+efPJJYmJiWL58uZl+9erV5oTUc+bMwdPTk6KiIgoKCmjd2hhz+vPPPycxMZGYmBgGDx5MXl7eGV8rh8PByJEjzSmFnn76abOen376KWAEnd27d2fQoEGEh4dX9W0ghKgFZ/tbFKOUynI+VoCnc1kBWmtdp1prJ8Qlbvprz541Tev2HUgceKOZPuKKPkT27ENeViY/jSs7efOwV94+a3mbN2/mzTffZNmyZTRs2JCMjAxGjBhh3iZNmsQjjzzCDz/8UCbfypUrsVgs+Pv7M2nSJOrXr09+fj6JiYkMHjyYBg0akJubS0JCAuPGjeP111/ntdde4+OPP+a+++5jwoQJBAcHs3LlSh544AH++OMPAPbt28eyZcuwWq0MHDiQTz75hK5du5KTk4OHhwebNm0iPj6+0n0KDAxk9OjR+Pj4mBN633LLLTz++ON069aNPXv20K9fP1JTUxk7dmy5bcyfP5+0tDRWrVqF1ppBgwaxePFi/P39mT59OkuXLsXV1ZUHHniAb775hjvuuIPc3Fw6duzIv//97zJ1iYuLM+eFTEpKIjIyktWrV2Oz2ejYsSMAN954I/feey8AL774Il988QUPP/xwuf2y2WzceuutREZG8sILL/DZZ59Rt25dVq9eTWFhIV27dqVv374ArFu3jk2bNhEUFHTW94AQovZUGnhprS+MtnMhxHnzxx9/MGTIEBo2bAhA/fr1Wb58Od9//z0At99+O2PGjDHTjxs3jq+//hpfX1+mT5+OUoqPPvqI2bNnA7B3717S0tJo0KABFouFYcOGAXDbbbdx4403kpOTw7Jlyxgy5NQINIWFhebjIUOGmKfpunbtyhNPPMGtt97KjTfeSPPmzf/2fi5YsICUlBRzOSsri5ycnAq3MX/+fObPn09cXBxgDESalpbGhg0bWLt2LYmJiQDk5+fTqFEjAKxWK4MHDy63XRcXF9q0aUNqaiqrVq3iiSeeYPHixdjtdnPy8U2bNv
Hiiy9y8uRJcnJy6NevX4X7MGrUKIYOHcoLL7wAwPz589mwYYM5EXZmZiZpaWm4ubnRoUMHCbqEuAhIRwAhalFVWqjOlN6rTt1zzv93lPTxKrFo0SIWLFjA8uXL8fLyomfPnhQUVNwDQSmFw+GgXr16ZivQ6by9vc3Hzz77LAMGDGDu3Ll07dqVX3/9lYiICP78889zrrfD4WDFihV4eHiUeb6ibWitee655xg1alSZtOPHj2fEiBH8619lWxYBPDw8zICxX79+HD58mISEBCZOnEiPHj345ZdfcHV1pU+fPowcORK73c57770HGKdYf/jhB2JiYpgyZQqLFi2qcB+6dOnCwoULefLJJ/Hw8EBrzfjx48sFaosWLSpzHIUQF67z30NWCHFB69WrFzNmzOD48eMAZGRk0KVLF7799lsAvvnmG7NlpiKZmZn4+fnh5eXFli1bWLFihbnO4XCYrTFTp06lW7du1KlTh6CgIGbMmAGA1pr169dXWPaOHTuIiorimWeeITExkS1btnDLLbewbNky/ve//5npFi9ezKZNm8rk9fX1NftrAfTt25fx48ebyyWBX0Xb6NevH5MmTTKn3Nm/fz9Hjhyhd+/ezJw5kyNHjpjHavfu3eXq/euvv5KcnMzEiRMB6N69Ox988AGdO3fG39+f48ePs3XrViIjIwHIzs4mICCA4uJivvnmmzMe67vvvptrrrmGoUOHYrPZ6NevH//3f/9HcbHR/Xbbtm3k5uaeMb8Q4sIjLV5CXGYiIiJ44YUXuOKKK7BarcTFxTF+/HjuvPNO3nvvPfz9/Zk8efIZ8/fv358JEyYQFhZGaGgonTp1Mtd5e3uzatUq3nzzTRo1asT06dMBI5i7//77efPNNykuLmb48OHExMSUK/uDDz5g4cKFWCwWIiIiuPrqq3F3d+fnn3/mscce47HHHsPV1ZXo6Gg+/PBDjh07ZuYdOHAgN910E3PmzGH8+PF89NFHPPjgg0RHR2Oz2ejRowcTJkw44zZSU1Pp3LkzYHSc//rrrwkPD+fNN9+kb9++OBwOXF1d+eSTT2jVqlWlx7hjx44cPnyYHj16ABAdHc2hQ4dQSgHwxhtv0LFjR/z9/enYsWOZgPF0TzzxBJmZmdx+++188803pKen0759e7TW+Pv7l+uLJ4S4sCmtdW3X4awSEhK0jGcjLgWpqamEhYXVdjWqjY+Pj0zULP6WS/2zIS4vSqm1WuuEitbJqUYhhBBCiBoigZcQ4ryR1i4hhKicBF5CCCGEEDVEAi8hhBBCiBoigZcQQgghRA2RwEsIIYQQooZI4CWEqLK33nqLiIgIoqOjiY2NZeXKlYAxV2LpMbVKdOnSBYD09HSmTp1qPp+cnMzcuXNrptKlrFq1ip49exIcHEz79u0ZMGAAGzduPC9lp6enmwOkCiHEmcgAqkKIKlm+fDk///wz69atw93dnWPHjlFUVFRpnmXLlgGnAq9bbrkFMAKvNWvWcM0111R5+zabDReXv/+VdfjwYYYOHcrUqVPNgHDJkiXmSPbnc1tCCHEm0uIlxGXo+uuvJz4+noiICD777DPsdjsjR44kMjKSqKgoxo0bVy7PwYMHadiwIe7u7gA0bNiQpk2blkmTn5/P1Vdfzeeffw4YA6qCMT9iUlISsbGxvPPOO7z88stMnz6d2NhYpk+fTm5uLnfddRcdOnQgLi6OOXPmADBlyhQGDRpEr1696N27d7k6vfPOO0RFRRETE8Ozzz4LQM+ePSkZcPnYsWMEBgYC8PHHHzNixAgz6ALo1q0b119/PWDMnzh69Gg6duzImDFjWLVqFZ07dyYuLo4uXbqwdetWs07XXXed2XL22muvmeXZ7XbuvfdeIiIi6Nu3L/n5+ef2wgghLnnyl06IWnLypx0UHTi/8+y5NfWm3sA2Z003adIk6tevT35+PomJicTHx7N//35z/sOTJ0+Wy9O3b19ef/11QkJC6NOnD8OGDeOKK64w1+fk5DB8+HDuuOMO7rjjjjJ53377bcaOHcvPP/8MQOPGjVmzZg0ff/wxAM8//zy9evVi0qRJnDx5kg4dOtCnTx8A1q1bx4YNG6hfv36ZMn/55RfmzJnDypUr8fLyIiMjo9J93rx5MyNGjKg0zb59+1i2bBlWq5WsrCySkpJwcXFhwYIFPP/888yaNQswTllu2rQJLy8vEhMTGTBgAA0bNiQtLY1p06bx+eefM3ToUGbNmsVtt91W6TaFEJeXamvxUkq1UEotVEqlKKU2K6UedT5fXyn1m1IqzXnvV111EEJU7KOPPiImJoZOnTqxd+9eioqK2LlzJw8//DDz5s2jTp065fL4+Piwdu1aPvvsM/z9/Rk2bBhTpkwx11933XXceeed5YKuqpg/fz5vv/02sbGx9OzZk4KCAvbs2QPAVVddVS7oAliwYAF33nknXl5eABWmqUzHjh0JCwvj0UcfNZ8bMmQIVqsVMCYDHzJkCJGRkTz++ONs3rzZTHfVVVfRoEEDPD09ufHGG1myZAkAQUFBxMbGAhAfH096evo51UkIcemrzhYvG/Ck1nqdUsoXWKuU+g0YCfyutX5bKfUs8CzwTDXWQ4gLUlVapqrDokWLWLBgAcuXL8fLy4uePXtSWFjI+vXr+fXXX5kwYQLfffcdr732GgMHDgRg9OjRjB49GqvVSs+ePenZsydRUVF8+eWXjBw5EoCuXbsyb948brnlFnMy6KrSWjNr1ixCQ0PLPL9y5Uq8vb3Nx6NGjQLg9ddfP2NZLi4uOBwOAAoKCsznIyIiWLduHdddd51Z3syZM81WOMDcFsBLL73ElVdeyezZs0lPT6dnz57mutP3r2S55DQsgNVqlVONQohyqq3FS2t9UGu9zvk4G0gFmgHXAV86k30JXF9ddRBClJeZmYmfnx9eXl5s2bKFFStWcOzYMRwOB4MHD+bNN99k3bp1tGjRguTkZJKTkxk9ejRbt24lLS3NLCc5OZlWrVqZy6+//jp+fn48+OCD5bbp6+tLdnb2GZf79evH+PHj0VoD8Ndff5Uro2PHjmZ9Bg0axFVXXcXkyZPJy8sDME81BgYGsnbtWgBmzpxp5n/wwQeZMmWK2eEfMPOe6Tg1a9YMoEzLHsBvv/1GRkYG+fn5/PDDD3Tt2vWM5QghRGk10rleKRUIxAErgcZa64POVYeAxjVRByGEoX///thsNsLCwnj22Wfp1KkT+/fvp2fPnsTGxnLbbbfxr3/9q1y+nJwcRowYQXh4ONHR0aSkpPDqq6+WSfPhhx+Sn5/PmDFjyjwfHR2N1WolJiaGcePGceWVV5KSkmJ2rn/ppZcoLi4mOjqaiIgIXnrppSrtx6BBg0hISCA2NpaxY8cC8NRTT/F///d/xMXFlRniokmTJkyfPp3nnnuOtm3b0qVLF2bOnMlDDz1UYfljxozhueeeIy4uDpvNVmZdhw4dGDx4MNHR0QwePJiEhISz1lcIIQBUyT/MatuAUj7An8BbWuvvlVIntdb1Sq0/obUu189LKXUfcB9Ay5Yt43fv3l2t9RSiJqSmphIWF
lbb1RD/wJQpU8pcGCDOD/lsiEuJUmqt1rrCf2TV2uKllHIFZgHfaK2/dz59WCkV4FwfABypKK/W+jOtdYLWOsHf3786qymEEEIIUSOq86pGBXwBpGqt3y+16keg5JruEcCc6qqDEEKcbyNHjpTWLiHE31adVzV2BW4HNiqlkp3PPQ+8DXynlLob2A0MrcY6CCGEEEJcMKot8NJaLwHOdE15+SGohRBCCCEucTJlkBBCCCFEDZHASwghhBCihkjgJYSosrfeeouIiAiio6OJjY1l5cqVgDFoaekxs0qUTEidnp7O1KlTzeeTk5OZO3duzVS6AiX1TU9PJzIystbqIYS4/EjgJYSokuXLl/Pzzz+bk1YvWLCAFi1aVJqnZJT48xF4nT6IaW270OojhLg4SOAlxGXo+uuvJz4+noiICD777DPsdjsjR44kMjKSqKgoxo0bVy7PwYMHadiwoTkfYcOGDWnatGmZNPn5+Vx99dV8/vnngDGxNsCzzz5LUlISsbGxvPPOO7z88stMnz7dHLk+NzeXu+66iw4dOhAXF8ecOcYoM1OmTGHQoEH06tWL3r3LXpPz4IMP8uOPPwJwww03cNdddwEwadIkXnjhhQr3szI7d+4kLi6O1atXs2PHDvr37098fDzdu3dny5YtgDGUxOjRo+nYsWO50fmFEKIqqnM4CSHEWUyePPmsaUJCQsy5ACdPnkxsbCxxcXHk5uby3XfflUl75513Vmm7kyZNon79+uTn55OYmEh8fDz79+9n06ZNAJw8ebJcnr59+/L6668TEhJCnz59GDZsGFdccYW5Picnh+HDh3PHHXdwxx13lMn79ttvM3bsWHNC6saNG5cZ/f3555+nV69eTJo0iZMnT9KhQwf69OkDYLaw1a9fv0yZ3bt3JykpiUGDBrF//34OHjRmIktKSmL48OEV7ufgwYNp0KBBuX3bunUrw4cPZ8qUKcTExNC7d28mTJhAcHAwK1eu5IEHHuCPP/4AYN++fSxbtgyr1VqlYy2EEKVJi5cQl6GPPvqImJgYOnXqxN69eykqKmLnzp08/PDDzJs3jzp16pTL4+Pjw9q1a/nss8/w9/dn2LBhZSaPvu6667jzzjvLBV1VMX/+fN5++21iY2Pp2bMnBQUF7NmzB4CrrrqqXNAFpwKvlJQUwsPDady4MQcPHmT58uVm37LT97P0JN8ljh49ynXXXcc333xDTEwMOTk5LFu2jCFDhhAbG8uoUaPMoA5gyJAhEnQJIf42afESohZVtYWqovTe3t7nnB9g0aJFLFiwgOXLl+Pl5UXPnj0pLCxk/fr1/Prrr0yYMIHvvvuO1157jYEDBwIwevRoRo8ejdVqpWfPnvTs2ZOoqCi+/PJLRo4cCUDXrl2ZN28et9xyC8bEFVWntWbWrFmEhoaWeX7lypV4e3ubj0eNGgXA66+/zqBBgzh58iTz5s2jR48eZGRk8N133+Hj44Ovr2+F+1lQUFBu23Xr1qVly5YsWbKE8PBwHA4H9erVIzk5ucK6ltRHCCH+DmnxEuIyk5mZiZ+fH15eXmzZsoUVK1Zw7NgxHA4HgwcP5s0332TdunW0aNGC5ORkkpOTGT16NFu3bi3TYpScnEyrVq3M5ddffx0/Pz8efPDBctv09fUlOzv7jMv9+vVj/PjxaK0B+Ouvv8qV0bFjR7M+gwYNAqBTp0588MEH9OjRg+7duzN27Fi6d+9+xv2siJubG7Nnz+arr75i6tSp1KlTh6CgIGbMmAEYQeH69eurfHyFEKIyEngJcZnp378/NpuNsLAwnn32WTp16sT+/fvp2bMnsbGx3HbbbfzrX/8qly8nJ4cRI0YQHh5OdHQ0KSkpvPrqq2XSfPjhh+Tn55freB4dHY3VaiUmJoZx48Zx5ZVXkpKSYnauf+mllyguLiY6OpqIiAheeumlKu1L9+7dsdlstG3blvbt25ORkWEGXhXt55l4e3vz888/M27cOH788Ue++eYbvvjiC2JiYoiIiDA7+wshxD+lSv5hXsgSEhL0mjVrarsaQvxjqamphIWF1XY1hLjgyGdDXEqUUmu11gkVrZMWLyGEEEKIGiKBlxBCCCFEDZHAS4gadjGc3heiJslnQlxOJPASogZ5eHhw/Phx+aERwklrzfHjx/Hw8KjtqghRI2QcLyFqUPPmzdm3bx9Hjx6t7aoIccHw8PCgefPmtV0NIWqEBF5C1CBXV1eCgoJquxpCCCFqiZxqFEIIIYSoIRJ4CSGEEELUEAm8hBBCCCFqiAReQgghhBA1RAIvIYQQQogaIoGXEEIIIUQNkcBLCCGEEKKGSOAlhBBCCFFDJPASQgghhKghEngJIYQQQtQQCbyEEEIIIWqIBF5CCCGEEDVEAi8hhBBCiBoigZcQQgghRA2RwEsIIYQQooZI4CWEEEIIUUOqLfBSSk1SSh1RSm0q9Vx9pdRvSqk0571fdW1fCCGEEOJCU50tXlOA/qc99yzwu9Y6GPjduSyEEEIIcVmotsBLa70YyDjt6euAL52PvwSur67tCyGEEEJcaGq6j1djrfVB5+NDQOMzJVRK3aeUWqOUWnP06NGaqZ0QQgghRDWqtc71WmsN6ErWf6a1TtBaJ/j7+9dgzYQQQgghqkdNB16HlVIBAM77IzW8fSGEEEKIWlPTgdePwAjn4xHAnBrevhBCCCFEranO4SSmAcuBUKXUPqXU3cDbwFVKqTSgj3NZCCGEEOKy4FJdBWutbz7Dqt7VtU0hhBBCiAuZjFwvhBBCCFFDJPASQgghhKghEngJIYQQQtQQCbyEEEIIIWqIBF5CCCGEEDVEAi8hhBBCiBoigZcQQgghRA2RwEsIIYQQooZI4CWEEEIIUUMk8BJCCCGEqCESeAkhhBBC1BAJvIQQQgghaogEXkIIIYQQNUQCLyGEEEKIGiKBlxBCCCFEDZHASwghhBCihkjgJYQQQghRQ1xquwIXioItW3Dk56MsFlAKVMk95nMWT0/cWrUCoGjfPpSrK66NGwNQvH8/WmuUUkY+sxxlPqfc3bH6+gJgz85Gubpi8fBAa40uKDDzKTiV32Ix8gshhBDioieBl9OBZ56lcOvWStN4REURNOM7APY9+BCuzZvT4pOPAdg1ZCj2jIxK8/v07m2m39GvP779+hLwyitgt7M1rn3lFVQKv5uH0+Tll9HO9A0ffJCGo+6j+NAhdg64tsJgr+Q5pRQN7rmb+iNGUHz4CLtvvplGTz1JnWuuoWDrVvY/8mipYK8k/6nysCj8H3gA3z59KExL48CLL9L4mWfxah9H3po1HPngA5SylNom5Zb9H34Yz6go8tev59hnn9P42Wdwa9GCnKQlnPx+ljPALFVnS9ll/4cexLVpU3JXrSLrf3Np9PRTWH18yF60iNylyyrcZun9aDjqPize3uSuWEHe2rX4P/ggADl//klhWhpYrCgXK1itKKsLWC0oq4v5XN0BAwAjSLefPIl3p07G8rZtOHJyjAC9JL2zLGV1lufmZgbp9pxco4re3gBou10CbCH+Bq01OBxgt6NL7p03HA603Y7F
yxurjze6uJji/fuxNmyI1ccHR34+xQcPGd8zVisoi/HYYjE+j857i48PFjc3tM2GLixEeXigrFZje1rLZ1ecMwm8nJq88gqO3BzQ+tQHqtRNOxxY69Qx0zd68gksnp6n8r/4Ao6CQmd6h/MLQZdZdmvWzEzv/8jDZusZFgv+Tz4BGjP9qXpgfLGg8YiMMvPXv+N2PKONZYunJ/VuGlzhNin1nGvLlgAoN1e8EhNx8fc38nt44BERAWh0SX7nNs1lrVEl+2u1YvXxRbmeevsoi9VZZzuUfOlpjdan9kEX2wCML7x9+8xl+8mTFG7dZmxTazSl9sPhMJcb3DkSgOK9+8hesAD/xx4FoDA1lcw5c07l16X2oeT105r6I+4wA6/jn080A6+sX+eT+f33lb9BXFzMwCvjy6/IXbGC4IV/AHDkvbHkJiVVnj0gwEy//7HHsGdlEfTddAB23TjYCPqtVuPL3sXlVMDmvHlERtLi//4DwN4HH8I1IIAmL74AwJ577sWRm3sqULRawcVqvCYuRhDpGRNDg7vuBODwv/6FR2QUdQdei3Y4OPLOu+XSK6sziLRawWrBo10Y3p06oh0OMmf/gEdkBB6hoTjy88lJSipXX8wyrCgXF1waN8a1USN0cTFFe/fh0sgfq48PuqgIe2bmqX0+PWC1XFq9IUremyX75SgsRBcVlQscyixrjXtQEABF+/bjyMvFIyQEgIKt27CfPAkOO9rucN47P382OzjsKA8PfK+8EoCcpCS0zWYuZ/74I7aMDLA7nJ/d8veuTQPwGz4cgGMTPsXFvyH1Bg8G4NDrb+DIzXVus3QdTgVDXvHtaTh6NAB77r0Pn25dqT9iBI6CAtKH31x2Xyu4r3fTYBo99hiO/Hy2de2G/8MP0+DOkRTt3s2Ofv3PeswbPfUkDe65h+IDB9jR/2qavvM2da+7joLNm9l92+1nzV+SPj85md233U7LyZPw7tyZ7F9/Zf/jTxiJVPmATSkjoGv+8cd4d+xA9h9/cPCVV2j11Ve4BwVxctYsjv3fBONzqyzO/Mp4bLWaj5t9MA63Fi3I+uUXMr7+hhafforVx5sTM2aQPe/XU/ksVuf2nX82neUGvPkGFi8vsn75hdwVKwl47VUATn4/m4JNG8ukLVeWm6v5PZn9xx/YDh/G7+abAciaN4/iQ4ec++usb+ljoCxYfH2o07cvALkrV4HWeHfqaCyvWIkjP8/4rCvndk8LgC2+vuZ7vXD7dpSHJ27Njd/Rwp27jENvBsvWU4+VBWW1oDw8sPr4AGDPyUG5uWFxczvra17dJPBy8mofd07pfXr0KLNc55przil/yRcZGKcyG957b5XzKquVRk89ZS5b69al8XPPVTm/i58fTd9521x2a9WKZu//u8r53Vu3puUXE81lr4QEWn31ZZXze3fqROs5P5jLdQdeS92B11Y5f73BN1Jv8I3mcsP776fh/fdXOX+jxx7D/9FHzeWA116lyUsvnvrBs9vRNpvzsQPsNuO+ZHsPPkD9O059YTd64nFsI0eU/dG02c0fQW2zY/FwN9P7DR+Go7Dw1PLNw7EdOWoGreY2bXbjOZsd11JBu1vLlrg0bGguW3x8jG3Z7Gi78a/c+NdvM35A7TYzyAbIXbYM5e4BgLbZODljBtrZOoDNZgSrp/G79VYj8LLZOPjCC/g/9hgeoaHYjmcYraVn4f/YYzQcPYriw0fYec01BLz1FvUG30j+5s3svvmWM2dUClxcCHjjdepdfz35Gzey9977aPbBB3h36khOUhKHXn+jwmDTDGItFhqNeRrPqCjy1v3F8S++oMnzz+HarBnZixaROWtWmUChovum772LW4sWZP70M8c//5xW33yN1deX4198wYlp35YLVk4vI/jPRVjr1uXov/9Nxlf/pd2G9QAcevllMuf8WOmxU25uZvqjH31I/rq/aLvgNwAOv/0v8pavqDS/a7NmZqCVMXkyjvwCc/nYZ59RtH3HmY+91YpX+/bm91XOokW4BQWZgVfe6tU48vJOBckV3Dvy8svsCy7Gz46yWHBt2tQI0C3WcvdYLSiLFY/QUCO9iwt+w4bhERYGgLVePRo++KCZrsJ7qxXPmBgjfYOGNH33HTzjjO96t6Agmo4da3x2HA7jz17px9qBtjvwiDL+4Lo2b06jp5/CzfkH1r1tWxo+/NCptA6H8dprh/GH1W5HawcujYzPnot/I3yv7GW2dLs0aoRn+zjndh3Gn1SzDs4/jg4Hynm8UBaUq6vzTADooiIcOTlGMO/cllmWw17qsfF5Ltq9h7wVp94r+RvWk/3rfOf+Osw/y6X/OFvc3E79Qf1lHvnr15uB14lp35K3cmXl772WLc3A69h//oO228zA69Brr1G0a1el+T0T4gn8+msA9j38CO7tQmk+bhwA6UOH4sjJqTS/79X9zfTbe15JvcE3ntNvZXVRuoIv2QtNQkKCXrNmTW1XQ4jLQpmgw2ZD2+0oFxcsXl5orbEdOIDF1xdrnTrooiIKd+0yAlWHo4KA1Qg+3QMDcQsMxJGbS/YfC/GMjcGtRQtsR4+S/fvvRqBaOti0nwoksTvw7dsXz8gIivbuJWPyFPxuvQX3Nm3I37CBjP9+XeoU06lgE9upfWj0zBg8o6LIWbqUI++NpfmHH+DWqhWZP/3E8c8+rzRwUFYLTV5/HbfmzcleuJCTs2bR9O23sfr4kDV3LtmLFp3hh/9UGf4PPYjFy4vcZcvIX7/e/KOQs3gxhTt2Vhx8OFsNlYuL+ccuf/NmHJmZeHfpAkBBSgr2rOxTrYtmnU9tX7m5ma3rtmPHQClcGjQAwJ6VZbzo5jZLlSGnz0QpJZ9x5WwxcuTlnfrcl5ydsNtPnamwO1AWhWvTpsCpftBuzZsDUJiWhqOgwBkclgr+zKDXjtXXF8/oaAByly/H4uODpzMQzvp1Prq42ExbUQDs2rwFPt27AZDx9Te4t21rBn7VTSm1VmudUOE6CbyEEEIIIc6fygKvS6sDhRBCCCHEBUz6eIlLX0ln+5IrNO3FYC8CN6OvBcUFxgUNFhejb0nJlZFCCCHEeSaBV4niAtB25xWFxpU5p64wdJz6YfZxdlLO3G/8SPs2MZaPpRk/6KXL0I6yZXjUg0btjPTpS8HbH/xDjHPbafNPy1dBGQ3aQPMEI/1fX0FALDSNhcJsSJ5aSX5nGYHdoFUXyD8JSz+EsIHQrD2c3AMrJpwhn/1U/rjbjPzHd8DCt6DrYxAQDXtXw9IPyuZzVHAcrnrNqP+uJFjwKtz4mbFPm2ZB0rgKjt1p+W+daRy/df+F31+DB1eBV334811Y+tGZ8+I8nf70DvBuCIv+Zez/y8eN5//3JCR/Xfb9YHEpdbOC1Q2e3m6s++1l2LMS7v71VP59q8vmUZayyz7+MGi8kX7pR1CcDz2fMZaT3oecI84rg1zKb9tiBd+mED3ESJ/yI7j7QhujgzRb5hrHTllL5XEpW56nn3GswXj93OsYddLauW2XCrZvlQBU/DMlf3pKrk4tyjPeVy7uxvN5GWW/Z8p9bzjAoy7
4NDLWHd4EPk3At7HxGTq4/rQ8JeWUyu8farz3C7Nh26/QPBH8WkHuMdi5yPlHy1LqT1fJVXbO5xuFG9/zBVlwJAX824FnPcg/YXx3mumtp8oqvezVEFw9wFYIRbnG/lisxu+Fw1Z+m+KSJ4FXiYl94PDGytM0i4d7jSEBmDoM6rWEm6cay5P6Q96xyvOHDjiV/rs7IPw6uPZ948th2rCz1zHhbiNw0Q746VG48kUj8CrIhF/GnD3/lS8agVNRDiwbb3wZNWtvfAGt+6rUuF2Wir9A2vY2yinOM77wCrNOLZ9IryDvaV9EJf0JrW7Gl0/Jj7qbr3EslSr/5VX6VtJC5RdoBI1WV2M5IBba31Fx3UvfXJ3DYbTtY2y/RMQN0DDY+BJ02I0vb4ft1LLjtCv96rcGW9GpZa+G4BtQPo+t4NRj+6mrGDmSYvwIlNg2D46knrY9e9nXrnniqcBr0b+M164k8JrzgPEjUJl218Lwb4zHE/tA5GAYMNb48v93yJnzlQSQifdC//9npH8/DLo/CZ3uh6wDMOXaUsGapeLgMe52iLrJeK/9/Dh0uBeCehhB4JL3y+c5PXBtdw00iYLsQ7B5NrQbYLxnTuyG3UudQWclgWvjSCNIL/mxbBhivB/yMowyy/zw6/KBQLP2xvvvxG44tg1a9zTef4dT4Hha+YDBXHbex95qBBu7l8OhjdDxPuP4bv0FDm8+wx8W52NlNf60APz1DZzYBb1eNJaXfGC8d8oFHqX+OHo3hOuM8QOZ/6Lx3r3mXWN55t1GeeX+bJYqJyAGhkw20k+5Fhq0hYEfGMsfREPByVKBzmnHDQ3h18NQ51XP/24HsbfA1W8bgdN7rSt/3wJ0HA1Xv2MELp/2gD6vQrfHIfsgTOp39vwl6XOOwKy74YZPjcDr2DZj+Wxu+BRihhtB3+Sr4Y45xuu/4w+YedfZ85ek3/I/mHknPLDS+AO56nP4tYIr7MoEYla470/jD/rqL4w/vA+vMwK/xWNh7RTn997pwV+pQG7Ez+BRx9he6k8wwnkV7ZJxsGtx5YGjiwdcbwxjw9ovje/5Pq8Yyys/g4wdzvyqgm1bwbP+qff6pu+N90TUTcbyhhlQmFk+j7msjIA7qIeRftdi4w9j01hjOX2J8/Nxev5Sx8PTD+q1MNJn7AT3uuDd4OyvWTWTwKtEp/uNwKlc4FDqx9z71CX59HoR3LxOLQ/6yPhRqugNXFJG6fw3f2v8EICR/t6FlQcNSp0KFixWeCLVaPUA40d/zK6zBx7K+W+qbnN4uVSQ2Kw9PL+v6seqSRQ8vPbUcusr4P6lVc/fsiPc/v2p5ZC+xq2qgrobt7+bv1UX41YiuI9xq6r4kWWXrzzHy5NvmFB2+e755dNofSoIc9jKrrtjzqnXEuCu+cap09KBW0nwVvKcV6kvm2vHnfoyUhYY8G/nFUW28sFjSTnNE0/lDxto/PgCWFyhaVzZfKcHrsVFxmcDjB/PY2lG6wEYP9o7Fp552yX7Xq+l8b47kQ7znjVaMeq1hAPr4IcqDCVy+2xo8//bu/9gO8r6juPvT24SjQEJiiQZQnqZgUGTDEZsERAziJYSpTAqHdIZKVg7FAsV7Q9G/cOi/1jrjLV1HCjir7YwpERhIo0ITqmDU5JCkBJItJMGMIlIIpQQhCbm5ts/9jnczTlnz9mb3LN7757PK3Mn5+w+u/t8z7Nnz3ef/XUebPsh3H45fOQBmL8EHl2dza+fP1kPx78JfrouK3/dE9n399Hbst7Tfpa+L0u8/vtuWH/D+I/R43dm88jL77C0fvxaidfPH4adG8cTr2ceg+0bOqfJb7/yxtrWp9mvyXri26fJz+e4XGK++Mxse9Oy5OJs3eu2o9aqU6uXH+Cdnxp/PzIbVn6hc3ntsbR6ame+ClbdmvU4Qdbzddkd3afJ1+e12VV1HHMiXP1g1lsG2Q7b1Q/27nE7ODa+rh//Jvjgd2DBaemzOAsuvaWgpy03v+NOTct7M6z8myyZgGw7+O7ri5P91vs587Lyrz95fD2CbAfwpBWd5dtjmDGSlW/1NLYc2Jd9D3vVfyR3z6tnHst2uFu23Zcduem67LHxOrbW9Ye+no1vJV4//Hy209LL4rPHE6+7/gwWLIPf+2b2/tZVsH9v4aQALH3/+E7DjSvg9Mvggs/1nqYCvqrRzKa2gwez/2fMyBKH/Xth1lyYOTs7dPOr3blkrUtv5cEDMH9plii98HSWvIy+I+sFePZ/sh6oXoeaNDLe47X3GdizPfsRHZmV9Za99GzxdK35zj0+G77/pSxRaf2YtnpO83vrZk3Q6j1tJX6/fjkb1uqweOm53M5at97asSxRPHY0K//M5uyQ7etSL+nPNsDBX7dNH4e+P3phdpQIslNajj0p+y5XwLeTMDMzM6uIbydhZmZmNgU48TIzMzOrSC2Jl6QLJP1U0lZJJc5sNTMzM5v+Kk+8JI0AXwFWAkuA35e0pOp6mJmZmVWtjttJnAFsjYhtAJJuAy4GNtdQFwB+cMt6xp7KLksVeuV+mzD+8tBrjQouSGgbrBLXLYjOq5iCKJy28Jqn9mWnku2zOWToRJYRPceWqpeIriVVXJVDPoeJXu9V5vOfyHy71r7HMrpHOwGTeN1LUT0KP/ca6jJhR1inbt+9ic1cPd6Vnaq8I/ncBr3M9nLF6365Ob5Sqk8z1PFZHonDW273LWTv9Xdwev8+9Cp0mPMqPaco+HXp9Nz58zjn3csPZ0GToo7E6wRge+79DqCax4UX+L8tu1h+4Ni+5YquAC27npQpNxnz6jeP6JU4qmAe5XLNguGD+dwK4ygx/85x5b/tg7wOOLq8mvx5H6Yemexk17bM+lNu2omXPJK1deLr9GA+0/Zpo2COg98mDepzKx46mdvjiUwxmd+B8jFMdA+93/IOs72OIAcsP69+W/zyxjRnkuZ0eKbsDVQlXQlcCbB48eKBLmv0mtPYtGd3fuHjL1HXW+sU7W1I3feC24d3luic/0SW274MFQwvM68Zfe4l1D7dIctVcbn8mIksV2p/0Tl/FQw/dB7FcXVbdvf6qOP4fOHnq2516WzXVplubVbcdl2mz3/2betwx5QF7aTO0YXr8Pj7znboXEe6fTc64x6vQ7c2LNdGSv86hnc5sWJGr+8T6tG23dfubp9lfppur81suNSReO0ETsy9X5SGHSIibgJuguw+XoOs0LIFoyxbMDrIRZiZmZnVclXjg8Apkk6SNBtYBaytoR5mZmZmlaq8xysiDki6Bvg+MAJ8PSIer7oeZmZmZlWr5RyviFgHrKtj2WZmZmZ18Z3rzczMzCrixMvMzMysIk68zMzMzCrixMvMzMysIk68zMzMzCrixMvMzMysIk68zMzMzCqiogc/TyWSdgNPDXgxxwG/HPAyprJhjt+xD69hjn+YY4fhjt+xD95vRMQbuo2YFolXFSQ9FBG/WXc96jLM8Tv24Ywdhjv+YY4dhjt+x15v7D7UaGZmZlYRJ15mZmZmFXHiNe6muitQs2GO37EPr2GOf5hjh+GO37HXyOd4mZmZmV
XEPV5mZmZmFRmqxEvSiZLuk7RZ0uOSru1SRpL+XtJWSY9KOr2Ouk62krGfK2mPpEfS36frqOsgSHq1pP+U9F8p/s90KfMqSatT22+QNFpDVSddydivkLQ71/Z/VEddB0XSiKQfS7qry7hGtnten/ib3vZPStqUYnuoy/hGbvOhVOxN3ubPk7RG0k8kbZF0Vtv42tp9ZlULmiIOAH8eEQ9LOhrYKOneiNicK7MSOCX9vQ24If0/3ZWJHeD+iLiwhvoN2j7gvIh4UdIs4EeSvhcR63NlPgz8b0ScLGkV8Hng0joqO8nKxA6wOiKuqaF+VbgW2AK8tsu4prZ7Xq/4odltD/DOiCi6d1NTt/ktvWKH5m7z/w64OyIukTQbeE3b+Nrafah6vCLi6Yh4OL3eS7YhOqGt2MXAP0ZmPTBP0sKKqzrpSsbeWKk9X0xvZ6W/9hMcLwa+lV6vAd4lSRVVcWBKxt5YkhYB7wVuLijSyHZvKRH/sGvkNn+YSToGWAF8DSAi9kfE823Famv3oUq88tLhhLcAG9pGnQBsz73fQcMSlB6xA5yVDkl9T9LSams2WOlwyyPALuDeiChs+4g4AOwBXl9pJQekROwAH0hd7msknVhtDQfqS8B1wMGC8Y1t9+RL9I4fmtv2kO1k3CNpo6Qru4xv8ja/X+zQzG3+ScBu4BvpEPvNkua2lamt3Ycy8ZJ0FPBt4GMR8ULd9alSn9gfJnvMwZuBLwN3Vly9gYqIsYhYDiwCzpC0rOYqVaZE7N8FRiPiNOBexnuApjVJFwK7ImJj3XWpQ8n4G9n2OedExOlkh5aulrSi7gpVqF/sTd3mzwROB26IiLcAvwI+UW+Vxg1d4pXOcfk2cEtEfKdLkZ1Afo9vURo27fWLPSJeaB2Sioh1wCxJx1VczYFLXc73ARe0jXql7SXNBI4Bnq20cgNWFHtEPBsR+9Lbm4G3Vly1QXk7cJGkJ4HbgPMk/XNbmSa3e9/4G9z2AETEzvT/LuAO4Iy2Io3d5veLvcHb/B3AjlzP/hqyRCyvtnYfqsQrnbfxNWBLRHyxoNha4A/SFQ9nAnsi4unKKjkgZWKXtKB1boukM8jWj0b8AEl6g6R56fUc4LeBn7QVWwtcnl5fAvxbNOBGd2Vibzu34SKycwCnvYj4ZEQsiohRYBVZm36wrVgj2x3Kxd/UtgeQNDddTEQ61HQ+8FhbsaZu8/vG3tRtfkT8Atgu6dQ06F1A+4VktbX7sF3V+HbgMmBTOt8F4FPAYoCIuBFYB7wH2Aq8BHyo+moORJnYLwE+IukA8DKwqik/QMBC4FuSRsg2Lv8SEXdJ+izwUESsJUtM/0nSVuA5sh+qJigT+0clXUR29etzwBW11bYCQ9LuhYao7ecDd6TcYiZwa0TcLekqaPw2v0zsTd7m/ylwS7qicRvwoanS7r5zvZmZmVlFhupQo5mZmVmdnHiZmZmZVcSJl5mZmVlFnHiZmZmZVcSJl5mZmVlFnHiZ2ZQhaUzSI5Iek3S7pPYH2+bLnivp7Nz7b0q6pMQyXuxXZqIkLZf0ntz76yX9xWQvx8ymPydeZjaVvBwRyyNiGbAfuKpH2XOBs3uMr9JysnsCmZn15MTLzKaq+4GTJf2upA3pYbc/kDQ/Pej9KuDjqYfsHWmaFZL+Q9K2kr1ffynpwfSA6M+kYaOStkj6qqTHJd2T7viPpN9KZR+R9IXUMzcb+CxwaRp+aZr9Ekn/nury0cn+cMxsenLiZWZTTnpm4kpgE/Aj4Mz0sNvbgOsi4kngRuBvUw/Z/WnShcA5wIXAX/dZxvnAKWTPr1sOvDX3EOFTgK9ExFLgeeADafg3gD9ODxwfA4iI/cCngdWpLqtT2TcCv5Pm/1fpWalmNuSG7ZFBZja1zck90up+ssf5nAqsTs8UnA080WP6OyPiILBZ0vw+yzo//f04vT+KLOH6GfBERLTqsREYTc+7PDoiHkjDbyVL8Ir8a3r49D5Ju8ge4bKjT53MrOGceJnZVPJy6k16haQvA1+MiLWSzgWu7zH9vvykfZYl4HMR8Q9tyxttm88YMKfPvPrVZQxvb80MH2o0s6nvGGBnen15bvhe4OgjmO/3gT+UdBSApBMkHV9UOCKeB/ZKelsalH+Y9pHWxcyGhBMvM5vqrgdul7QR+GVu+HeB97WdXF9aRNxDdrjwAUmbgDX0T54+DHw1HQ6dC+xJw+8jO5k+f3K9mVkHRUTddTAzmxYkHRURL6bXnwAWRsS1NVfLzKYRn3NgZlbeeyV9kmzb+RRwRb3VMbPpxj1eZmZmZhXxOV5mZmZmFXHiZWZmZlYRJ15mZmZmFXHiZWZmZlYRJ15mZmZmFXHiZWZmZlaR/wd+5LmtinetbwAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "seed_idx = list(range(2,max_depth +1))\n", + "\n", + "plt.figure(figsize=(10,5))\n", + "\n", + "for i in range(len(data)):\n", + " plt.plot(seed_idx, time_algo_cu[i], label = (str(names[i] + \"-cuGraph\")))\n", + "\n", + " plt.plot(seed_idx, time_algo_wk[i], label = (str(names[i] + \"-walker\")), linestyle='-.')\n", + "\n", + "\n", + "plt.title(f'Runtime vs. Path Length ({num_seeds} Seeds)')\n", + "plt.xlabel('Path length')\n", + "plt.ylabel('Runtime')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5164" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "del time_algo_cu\n", + "del time_algo_wk\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 2: Runtime Speedup versus number of seeds\n", + "The number of seeds will be increased over a range in increments of 50. \n", + "The runtime will be the sum of runtime per increment. Increaing number of seeds by 1 would make for very long execution times " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading ./data/preferentialAttachment.mtx...\n", + "\t.Random walks - T=2.28s\n", + ".Random walks - T=2.29s\n", + ".Random walks - T=2.28s\n", + ".Random walks - T=2.21s\n", + ".Random walks - T=1.95s\n", + ".Random walks - T=2.38s\n", + ".Random walks - T=2.00s\n", + ".Random walks - T=2.19s\n", + ".Random walks - T=1.99s\n", + ".Random walks - T=2.40s\n", + ".Random walks - T=1.95s\n", + ".Random walks - T=2.17s\n", + ".Random walks - T=1.95s\n", + ".Random walks - T=2.39s\n", + ".Random walks - T=1.97s\n", + ".Random walks - T=2.23s\n", + ".Random walks - T=2.63s\n", + ".Random walks - T=4.08s\n", + ".Random walks - T=3.44s\n", + ".Random walks - T=3.77s\n", + " \n", + "Reading ./data/dblp-2010.mtx...\n", + "\t.Random walks - T=6.61s\n", + ".Random walks - T=6.57s\n", + ".Random walks - T=6.48s\n", + ".Random walks - T=6.69s\n", + ".Random walks - T=6.11s\n", + ".Random walks - T=6.18s\n", + ".Random walks - T=4.98s\n", + ".Random walks - T=5.64s\n", + ".Random walks - T=3.83s\n", + ".Random walks - T=4.28s\n", + ".Random walks - T=4.34s\n", + ".Random walks - T=4.14s\n", + ".Random walks - T=3.79s\n", + ".Random walks - T=4.37s\n", + ".Random walks - T=4.00s\n", + ".Random walks - T=3.66s\n", + ".Random walks - T=4.01s\n", + ".Random walks - T=3.67s\n", + ".Random walks - T=4.32s\n", + ".Random walks - T=3.70s\n", + " \n", + "Reading ./data/coPapersCiteseer.mtx...\n", + "\t.Random walks - T=56.64s\n", + ".Random walks - T=52.26s\n", + ".Random walks - T=45.66s\n", + ".Random walks - T=48.81s\n", + ".Random walks - T=56.16s\n", + ".Random walks - T=56.73s\n", + ".Random walks - T=45.43s\n", + ".Random walks - T=44.96s\n", + ".Random walks - T=51.77s\n", + ".Random walks - T=58.39s\n", + ".Random walks - T=43.35s\n", + ".Random walks - T=42.89s\n", + ".Random walks - T=57.96s\n", + ".Random walks - T=45.03s\n", + ".Random walks - T=64.27s\n", + ".Random walks - T=52.57s\n", + ".Random walks - T=46.91s\n", + ".Random walks - T=55.62s\n", + ".Random walks - T=46.85s\n", + ".Random walks - T=44.84s\n", + " \n", + "Reading ./data/as-Skitter.mtx...\n", + "\t.Random 
walks - T=51.36s\n", + ".Random walks - T=52.06s\n", + ".Random walks - T=44.91s\n", + ".Random walks - T=49.73s\n", + ".Random walks - T=47.45s\n", + ".Random walks - T=52.21s\n", + ".Random walks - T=47.65s\n", + ".Random walks - T=45.49s\n", + ".Random walks - T=47.84s\n", + ".Random walks - T=43.48s\n", + ".Random walks - T=45.67s\n", + ".Random walks - T=45.75s\n", + ".Random walks - T=55.03s\n", + ".Random walks - T=46.39s\n", + ".Random walks - T=50.64s\n", + ".Random walks - T=43.87s\n", + ".Random walks - T=40.98s\n", + ".Random walks - T=49.42s\n", + ".Random walks - T=51.94s\n", + ".Random walks - T=49.28s\n", + " \n" + ] + } + ], + "source": [ + "# some parameters\n", + "rw_depth = 4\n", + "max_seeds = 1000\n", + "\n", + "# arrays to capture performance gains\n", + "names = []\n", + "\n", + "# Two dimension data\n", + "time_algo_cu = [] # will be two dimensional\n", + "time_algo_wk = [] # will be two dimensional\n", + "perf = [] # will be two dimensional\n", + "\n", + "i = 0\n", + "for k,v in data.items():\n", + " time_algo_cu.append([])\n", + " time_algo_wk.append([])\n", + " perf.append([])\n", + " \n", + " # Saved the file Name\n", + " names.append(k)\n", + "\n", + " # read data\n", + " gdf = read_data(v)\n", + " pdf = gdf.to_pandas()\n", + " \n", + " # Create the Graphs\n", + " Gcg = create_cu_ugraph(gdf)\n", + " Gnx = create_nx_ugraph(pdf)\n", + " \n", + " num_nodes = Gcg.number_of_nodes()\n", + " nodes = Gcg.nodes().to_array().tolist()\n", + " \n", + " print('\\t', end='')\n", + " for j in range (50, max_seeds +1, 50) :\n", + " print('.', end='')\n", + " seeds = random.sample(nodes, j)\n", + " tc = run_cu_rw(Gcg, seeds, rw_depth)\n", + " tw = run_wk_rw(Gnx, seeds, rw_depth)\n", + " \n", + " time_algo_cu[i].append(tc)\n", + " time_algo_wk[i].append(tw) \n", + " perf[i].append(tw/tc)\n", + " \n", + "\n", + " # update i\n", + " i = i + 1\n", + " print(\" \")\n", + " \n", + " del Gcg\n", + " del Gnx\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAmQAAAFNCAYAAACuWnPfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAABhrElEQVR4nO3deXhU5d3/8fd3JvtKQsK+KyBbCGFRFNndqlZbta4t1Nra/uxma936WFuf2mprH6u2fazdsNatWn1cq1ZFRQUREBCRHZSdhED2ZDIz9++PORkmIUDAJJPA53Vdc80591nmOzMJfHKf+5xjzjlEREREJH588S5ARERE5FinQCYiIiISZwpkIiIiInGmQCYiIiISZwpkIiIiInGmQCYiIiISZwpkInLYzOwNM7sq3nV0VGa2ycxmxum1u5vZW2ZWYWa/iUcNXh0DzMyZWUK8ahDpTBTIRDoRM5tkZu+aWZmZlZrZO2Y2Pt51dWQxweDFJu3/MLOfxqmstvQNoATIcs79sOlCM+tjZv8ysxLv52iFmc1u9ypFpBEFMpFOwsyygOeB+4BcoDfwM6AunnV1Iiea2cnxLuJwHGHvUn9gpTvwVb8fAjZ763UFvgzsPLIKRaS1KJCJdB5DAJxzjzrnQs65GufcK8655QBmNtvrMfud1/OxysxmNGxsZtlm9hcz225mW83s52bmj1l+pZl9bGZ7zOxlM+sfs+w0b39lZvY7wGKW/dTM/hEz3+hQlXd485dmttDMys3sGTPLbe4Neq9/Tsx8gpkVm1mRmaV4vVq7zWyvmb1vZt0P4/P7FXD7AV53tpm93aTNmdnx3vQcM/uDmf3bzCq9z7mHmf3W+7xWmdmYJrsdb2YrveV/M7OUmH2fY2ZLvffxrpkVxCzbZGY3mNlyoKq5UGZmJ3vvv8x7PrmhTmAWcL1XZ3OHTccDc5xzVc65oHPuA+fcv2P2fZJX014zW2ZmU2OWHfBnyMz8ZnaX1/O2ATi7mc94g3codaOZXd7cdyFyrFIgE+k81gAhM3vQzM4ys5xm1jkRWA/kAbcCT8WEnzlAEDgeGAOcDlwFYGbnATcDXwTygXnAo96yPOAp4L+8/a4HTjnM2r8CXAn09Gq49wDrPQpcGjN/BlDinFtCJGhkA32J9Ox8E6g5jBr+AAw5QEhpiS+x7zOoA+YDS7z5J4H/abL+5V79xxEJ0/8F4AW3vwJXe+/jj8CzZpYcs+2lRAJNF+dcMHan3vf5ApHPsKv3ui+YWVfn3GzgYeBXzrkM59yrzbyPBcDvzewSM+vXZN+9vX3/nEgv7HXAv8ws31tlDgf4GQK+DpzjtY8DLozZb7pX71nOuUzgZGBpM7WJHLMUyEQ6CedcOTAJcMCfgGIze7ZJL9Eu4LfOuXrn3OPAauBsb53PAd/3ekZ2AXcDl3jbfRP4pXPuYy8A/AIo9HrJPgd85Jx70jlXD/wW2HGY5T/knFvhnKsCbgG+FNs7F+MR4PNmlubNX4YXDIF6IgHkeK+HcLH3mbRUDZEesp8fZu0NnvZesxZ4Gqh1zv3dORcCHicSRGL9zjm32TlX6r1uQ9D8BvBH59x73vt4kEjAOylm23u9bZsLnGcDa51zD3k9XI8Cq4BzW/g+LiISuG8BNno9dQ3jEK8AXnTOveicCzvn/gMsAj7Xgp+hLxH52Wt4z79s8rphYKSZpTrntjvnPmphvSLHBAUykU7EC0yznXN9gJFALyIBqcHWJmOHPvHW6Q8kAtu9Q1F7ifTMdPPW6w/cE7OslMhhyd7e9ptjanCx8y0Uu/4nXi15zby/dcDHwLleKPs8kZAGkbFPLwOPmdk2M/uVmSUeZh1/BrqbWUvDS6zYcVY1zcxnNFm/6Xvu5U33B37Y8Fl7n3ffmOVNt22ql7e/WJ8Q+a4OyTm3xzl3o3NuBNCdSE/V/5mZebVd1KS2SUR6Ng/1M9To5yS2Ri+IX0wk+G83sxfM7ISW1CtyrFAgE+mknHOriBxCGhnT3Nv7j7VBP2Abkf8o64A851wX75Hl/aeMt/zqmGVdnHOpzrl3ge1EAgMA3v77xrxGFZAWM9+jmXJj1+9HpLer5ABvreGw5XlEBqev895vvXPuZ8654UQOeZ1D5FBoiznnAkROhPhvYsbBNX0PZtbcezhcTd/zNm96M3B7k886zevpipZ6kP1uIxKOYvUDth5ugc65EuAuImEq16vtoSa1pTvn7uDQP0ONfk68mmJf62Xn3GlEwt0qIr28IuJRIBPpJMzsBDP7oZn18eb7EgkuC2JW6wZ818wSzewiYBiRQ1DbgVeA35hZlpn5zOw4M5vibXc/cJOZjfD2ne1tD5ExRSPM7IveAPPv0jh0LQUmm1k/M8sGbmqm/CvMbLjX63Ub8KR3qK85jxEZm/Qt9vWOYWbTzGyUd6iznEioCx/qc2vGQ0AKcGZM2zLvPRZ6g+9/egT7beoai1xiIhf4MZHDmhAJIt80sxMtIt3MzjazzBbu90UiY+Eus8hJDxcDw4mcgXtIZnanmY30ts0k8jmvc87tBv5BpHfyDG+QfoqZTTWzPi34GfonkZ+9Pt74xhtjXrO7mZ3njSWrAyo5su9O5KilQCbSeVQQGbT/nplVEQliK4DYa029Bwwm0vt0O3Ch9x8tRHqTkoCVwB4iA9F7AjjnngbuJHI4sNzb71neshIi447uAHZ7+3+n4QW9cUaPA8uBxTQfDB4i0pu3g0gY+u6B3qT3H/98Ir1gj8cs6uHVXE7ksOab3n4xs/vN7P4D7bPJ/kPAT4j0CDW0rSESFF8F1gJvN7/1YXmESIDZQOREiJ97r7WIyAD43xH5HtYBs1u6U+/7PIfI974buB44x/ueWiKNyBi4vV5t/YkcGsY5t5lIz+TNQDGRXrEfse//igP+DBEJmi8TCbdLiJwI0sAH/IBI714pMIVIEBQRjx34UjUi0plY5OKeVznnJsW7llhm9gbwD+fcn+Ndi4hIR6UeMhEREZE4UyATERERiTMdshQRERGJM/WQiYiIiMSZApmIiIhInO1309rOJC8vzw0YMCDeZYiIiIgc0uLFi0ucc/nNLevUgWzAgAEsWrQo3mWIiIiIHJKZNb3tWZQOWYqIiIjEmQKZiIiISJwpkImIiIjEWaceQyYiInIk6uvr2bJlC7W1tfEuRY5CKSkp9OnTh8TExBZvo0AmIiLHnC1btpCZmcmAAQMws3iXI0cR5xy7d+9my5YtDBw4sMXb6ZCliIgcc2pra+natavCmLQ6M6Nr166H3fuqQCYiIsckhTFpK0fys6VAJiIi0gnNmzePESNGUFhYSE1NTZu9zpw5c9i2bVt0/qqrrmLlypUH3Wbq1KmNrhO6dOlSzIyXXnop2rZp0yYeeeSRRuu8+OKLR1zngAEDKCkpOeLtD9dnrbcpBTIREZEOKhQKHXDZww8/zE033cTSpUtJTU095L6cc4TD4cOuoWkg+/Of/8zw4cMPax+PPvookyZN4tFHH422tXYga28KZO1oe+V2/rHyH9QGdRaOiIi0rk2bNnHCCSdw+eWXM2zYMC
688EKqq6sZMGAAN9xwA0VFRTzxxBO88sorTJw4kaKiIi666CIqKyv585//zD//+U9uueUWLr/8cgB+/etfM378eAoKCrj11lujrzF06FC+8pWvMHLkSDZv3nzA9YYNG8bXv/51RowYwemnn05NTQ1PPvkkixYt4vLLL4/2xMX2fn3rW99i3LhxjBgxIrqvppxzPPHEE8yZM4f//Oc/0bFVN954I/PmzaOwsJA777yTn/zkJzz++OMUFhby+OOPs3DhQiZOnMiYMWM4+eSTWb16NRAJqddddx0jR46koKCA++67L/pa9913H0VFRYwaNYpVq1YB8NOf/pRZs2Zx6qmn0r9/f5566imuv/56Ro0axZlnnkl9fT0AixcvZsqUKYwdO5YzzjiD7du3A5HevhtuuIEJEyYwZMgQ5s2bRyAQ2K/ez8w512kfY8eOdW1p7qdz3cg5I93C7Qvb9HVERKR9rVy5Mt4luI0bNzrAvf32284557761a+6X//6165///7uzjvvdM45V1xc7E499VRXWVnpnHPujjvucD/72c+cc87NmjXLPfHEE845515++WX39a9/3YXDYRcKhdzZZ5/t3nzzTbdx40ZnZm7+/PmHXM/v97sPPvjAOefcRRdd5B566CHnnHNTpkxx77//frTu2Pndu3c755wLBoNuypQpbtmyZfut8/bbb7vp06c755y79NJL3ZNPPumcc27u3Lnu7LPPju73b3/7m7vmmmui82VlZa6+vt4559x//vMf98UvftE559wf/vAHd8EFF0SXNdTQv39/d++99zrnnPv973/vvva1rznnnLv11lvdKaec4gKBgFu6dKlLTU11L774onPOufPPP989/fTTLhAIuIkTJ7pdu3Y555x77LHH3Fe/+tXoe/nBD37gnHPuhRdecDNmzGi23qaa+xkDFrkDZBpd9uIgxnQbA8CSnUsY32N8nKsREZG28LPnPmLltvJW3efwXlnceu6IQ67Xt29fTjnlFACuuOIK7r33XgAuvvhiABYsWMDKlSuj6wQCASZOnLjffl555RVeeeUVxoyJ/L9VWVnJ2rVr6devH/379+ekk0465HoDBw6ksLAQgLFjx7Jp06ZD1v/Pf/6TBx54gGAwyPbt21m5ciUFBQWN1nn00Ue55JJLALjkkkv4+9//zgUXXHDIfZeVlTFr1izWrl2LmUV7sl599VW++c1vkpAQiTC5ubnRbb74xS9G63/qqaei7WeddRaJiYmMGjWKUCjEmWeeCcCoUaPYtGkTq1evZsWKFZx22mlApBeuZ8+eze63JZ/LkVAgO4js5GyO73I8S3YtiXcpIiJyFGp6Nl7DfHp6OhA5inXaaac1GnvVHOccN910E1dffXWj9k2bNkX3daj1kpOTo/N+v/+QJwps3LiRu+66i/fff5+cnBxmz56936UeQqEQ//rXv3jmmWe4/fbbo9foqqioOOi+AW655RamTZvG008/zaZNm5g6deoht2l4D36/n2AwuF+7z+cjMTEx+jn7fD6CwSDOOUaMGMH8+fMPa7+tSYHsEMZ2H8tz658jGA6S4NPHJSJytGlJT1Zb+fTTT5k/fz4TJ07kkUceYdKkSXzwwQfR5SeddBLXXHMN69at4/jjj6eqqoqtW7cyZMiQRvs544wzouPJMjIy2Lp1a7NXiW/perEyMzObDVDl5eWkp6eTnZ3Nzp07+fe//71faHrttdcoKCjg5ZdfjrbNmjWLp59+mhEjRjTab9PXKSsro3fv3kDkxIIGp512Gn/84x+ZNm0aCQkJlJaWNuolOxJDhw6luLg4+l3U19ezZs0aRow48M/GgT6XI6VB/YdQ1K2I6mA1a/asiXcpIiJylBk6dCi///3vGTZsGHv27OFb3/pWo+X5+fnMmTOHSy+9lIKCAiZOnBgdrB7r9NNP57LLLmPixImMGjWKCy+8sNmw0NL1Ys2ePZtvfvOb+11eY/To0YwZM4YTTjiByy67LHpYNdajjz7KF77whUZtF1xwAY8++igFBQX4/X5Gjx7N3XffzbRp01i5cmV0kPz111/PTTfdxJgxYxr1Sl111VX069ePgoICRo8e3ehMzSOVlJTEk08+yQ033MDo0aMpLCzk3XffPeg2Tev9rCwyxqxzGjdunIu9zklb2FG1g9OePI0bxt/AFcOvaNPXEhGR9vHxxx8zbNiwuNawadMmzjnnHFasWBHXOqRtNPczZmaLnXPjmltfPWSH0CO9B73Se2kcmYiIiLQZBbIWKOpexOKdi+nMvYkiItKxDBgwQL1jEtXmgczM/Gb2gZk9780PNLP3zGydmT1uZklee7I3v85bPqCta2upou5FlNaW8kn5J/EuRURERI5C7dFD9j3g45j5O4G7nXPHA3uAr3ntXwP2eO13e+t1CGO7jQXQYUsRERFpE20ayMysD3A28Gdv3oDpwJPeKg8C53vT53nzeMtn2JHcLr0NDMweSE5yDot3Lo53KSIiInIUausest8C1wMNdzPtCux1zjWcv7oF6O1N9wY2A3jLy7z1487MGNNtDEt2qodMREREWl+bBTIzOwfY5Zxr1W4lM/uGmS0ys0XFxcWtueuDKupexJbKLeyq3tVurykiIseOn/70p9x1112Nbt4da86cOXz729+OQ2XSHtqyh+wU4PNmtgl4jMihynuALmbWcMn7PsBWb3or0BfAW54N7G66U+fcA865cc65cfn5+W1YfmNju3vjyNRLJiIiIq2szQKZc+4m51wf59wA4BLgdefc5cBc4EJvtVnAM970s9483vLXXQe6zsQJuSeQmpCqcWQiItJqbr/9doYMGcKkSZNYvXp1tP2hhx6isLCQkSNHsnDhwv22a7h6/rhx4xgyZAjPP/98e5YtbSAeN2e8AXjMzH4OfAD8xWv/C/CQma0DSomEuA4jwZfA6PzRfLDrg0OvLCIicgiLFy/mscceY+nSpQSDQYqKihg7NnI0prq6mqVLl/LWW29x5ZVXNnu9sk2bNrFw4ULWr1/PtGnTWLduHSkpKe39NqSVtEsgc869AbzhTW8AJjSzTi1wUXvUc6SKuhfxv0v/l/JAOVlJWfEuR0REWsO/b4QdH7buPnuMgrPuOOgq8+bN4wtf+AJpaWkAfP7zn48uu/TSSwGYPHky5eXl7N27d7/tv/SlL+Hz+Rg8eDCDBg1i1apVFBYWttpbkPalK/UfhqJuRTgcS3ctjXcpIiJyFGt61afmrgLVknWk84jHIctOqyC/gARLYMnOJUzuMzne5YiISGs4RE9WW5k8eTKzZ8/mpptuIhgM8txzz3H11VcD8PjjjzNt2jTefvttsrOzyc7O3m/7J554glmzZrFx40Y2bNjA0KFD2/stSCtSIDsMqQmpDO86XFfsFxGRz6yoqIiLL76Y0aNH061bN8aPHx9dlpKSwpgxY6ivr+evf/1rs9v369ePCRMmUF5ezv3336/xY52cdaATGQ/buHHjXHPXamlLv1n0Gx7++GHmXzafZH9yu762iIi0jo8//phhw4bFu4wjNnv2bM455xwuvPDCQ
68scdHcz5iZLXbOjWtufY0hO0xF3YqoD9fzYXErDwAVERGRY5YOWR6mMd3GAJEbjY/r0WzIFRERaVNz5syJdwnSytRDdgguHG403yWlC8d3OV5X7BcREZFWo0B2EJVvvsnaSadSv2NHo/aibkUsLV5KKByKU2UiIiJyNFEgO4jEvv0IlZZS8frrjdqLuhdRVV/F6j2rD7CliIiISMspkB1E8qCBJA0aROWrrzVq143GRUREpDUpkB1C5ozpVC1cSKi8PNrWI70HvdJ76XpkIiLSLn7605/Su3fv6A3Hn3322bjUsWbNGj73uc8xePBgioqK+NKXvsTOnTtZtGgR3/3udwF44403ePfdd+NSX2emQHYImTNmQDBI5ZtvNWov6l7Ekp1L6MzXcRMRkc7j2muvZenSpTzxxBNceeWVhJucdNZagsFgs+21tbWcffbZfOtb32Lt2rUsWbKE//f//h/FxcWMGzeOe++9F4hvIAuFOu/YbgWyQ0gpKMCfn0fFa40PWxZ1L2J37W4+rfg0TpWJiEhn9/e//52CggJGjx7Nl7/8ZTZt2sT06dMpKChgxowZfPrp/v/HDBs2jISEBEpKSjj//PMZO3YsI0aM4IEHHoiuk5GRwbXXXsuIESOYMWMGxcXFAKxfv54zzzyTsWPHcuqpp7Jq1SogcqHZb37zm5x44olcf/31vPnmmxQWFlJYWMiYMWOoqKjgkUceYeLEiZx77rnR15k6dSojR47kjTfe4JxzzmHTpk3cf//93H333RQWFjJv3jyKi4u54IILGD9+POPHj+edd94BaPY1AH79618zfvx4CgoKuPXWW6Ov9Y9//IMJEyZQWFjI1VdfHQ1fGRkZ/PCHP2T06NHMnz+/lb+hduSc67SPsWPHuvaw7Se3ulVjilyotjbatm7POjdyzkj31Jqn2qUGERFpPStXrox3CW7FihVu8ODBrri42Dnn3O7du90555zj5syZ45xz7i9/+Ys777zznHPO3Xrrre7Xv/61c865BQsWuJ49e7pwOOx2797tnHOuurrajRgxwpWUlDjnnAPcP/7xD+eccz/72c/cNddc45xzbvr06W7NmjXR/UybNs0559ysWbPc2Wef7YLBoHPOuXPOOce9/fbbzjnnKioqXH19vbv22mvdb3/722bfy9y5c93ZZ5+9X63OOXfppZe6efPmOeec++STT9wJJ5xwwNd4+eWX3de//nUXDoddKBRyZ599tnvzzTfdypUr3TnnnOMCgYBzzrlvfetb7sEHH4y+18cff/ywP/+21tzPGLDIHSDT6MKwLZA5cwZ7H3+c6gULyJgyBYBB2YPoktyFxTsX84XBX4hzhSIicqTuXHgnq0pXteo+T8g9gRsm3HDQdV5//XUuuugi8vLyAMjNzWX+/Pk89dRTAHz5y1/m+uuvj65/9913849//IPMzEwef/xxzIx7772Xp59+GoDNmzezdu1aunbtis/n4+KLLwbgiiuu4Itf/CKVlZW8++67XHTRRdF91tXVRacvuugi/H4/AKeccgo/+MEPuPzyy/niF79Inz59jvizePXVV1m5cmV0vry8nMrKymZf45VXXuGVV15hzJjIRdgrKytZu3Yty5cvZ/HixdH7fdbU1NCtWzcA/H4/F1xwwRHX11EokLVA2okn4ktPp+LV16KBzMwY022MBvaLiEi7uPbaa7nuuuui82+88Qavvvoq8+fPJy0tjalTp1JbW9vstmZGOBymS5cuLF26tNl10tPTo9M33ngjZ599Ni+++CKnnHIKL7/8MiNGjODNN9887LrD4TALFizY7+bnzb2Gc46bbrqJq6++utG69913H7NmzeKXv/zlfvtPSUmJBsnOTIGsBXxJSWRMmUzF3Ln0CIcxX2To3djuY5m7eS7F1cXkp+XHuUoRETkSh+rJaivTp0/nC1/4Aj/4wQ/o2rUrpaWlnHzyyTz22GN8+ctf5uGHH+bUU0894PZlZWXk5OSQlpbGqlWrWLBgQXRZOBzmySef5JJLLuGRRx5h0qRJZGVlMXDgQJ544gkuuuginHMsX76c0aNH77fv9evXM2rUKEaNGsX777/PqlWruOyyy/jlL3/JCy+8wNlnnw3AW2+9RW5ubqNtMzMzKY+5MsHpp5/Offfdx49+9CMAli5dSmFhYbOvccYZZ3DLLbdw+eWXk5GRwdatW0lMTGTGjBmcd955XHvttXTr1o3S0lIqKiro37//Z/oOOhIN6m+hjBkzCJWUULNsWbStqFsRAIt3LY5XWSIi0kmNGDGCH//4x0yZMoXRo0fzgx/8gPvuu4+//e1vFBQU8NBDD3HPPfcccPszzzyTYDDIsGHDuPHGGznppJOiy9LT01m4cCEjR47k9ddf5yc/+QkADz/8MH/5y18YPXo0I0aM4Jlnnml237/97W8ZOXIkBQUFJCYmctZZZ5Gamsrzzz/Pfffdx+DBgxk+fDh/+MMfyM9v3CFx7rnn8vTTT0cH9d97770sWrSIgoIChg8fzv3333/A1zj99NO57LLLmDhxIqNGjeLCCy+koqKC4cOH8/Of/5zTTz+dgoICTjvtNLZv3/5Zv4IOxVwnvmzDuHHj3KJFi9rltUIVFaw5+RS6zvoK3bwu4/pwPac8egrnH38+N594c7vUISIin93HH3/MsGHD4l1Gm8nIyKCysjLeZRzTmvsZM7PFzrlxza2vHrIW8mdmkj5hAhX/eTV67bFEXyIF+QW6Yr+IiIh8JgpkhyFz5gwCn3xCYMOGaNvYbmNZs2cN5YHyg2wpIiLSftQ71vkokB2GjOnTAaiIubdlUfciHI6lu5bGqSoRERHp7BTIDkNi9+6kFBQ0ump/QX4BCZbAB7s+iGNlIiIi0pkpkB2mzBkzqF2+nPqdOwFITUhleNfhGkcmIiIiR0yB7DBlzpwBQOXrr0fbiroX8WHJh9SF6g60mYiIiMgBKZAdpqRBg0gaMKDROLIx3cZQH65nRcmKOFYmIiLHittvv50RI0ZQUFBAYWEh7733HgADBgygpKRkv/VPPvlkADZt2sQjjzwSbV+6dCkvvvhi+xQtB6VAdpjMjMyZM6hauJCQd2f6hgvE6rCliIi0tfnz5/P888+zZMkSli9fzquvvkrfvn0Pus27774LtE4gCwaDh1+0HJIC2RHImDED6uupfOstALqkdOG47ON0xX4RETks559/PmPHjmXEiBE88MADhEIhZs+ezciRIxk1ahR33333ftts376dvLw8kpOTAcjLy6NXr16N1qmpqeGss87iT3/6ExC5UCxE7h85b948CgsLufPOO/nJT37C448/TmFhIY8//jhVVVVceeWVTJgwgTFjxkSv5D9nzhw+//nPM336dGbMmNGWH8kxS/eyPAKpo0fjz8uj8rXXyPbu51XUvYh/b/w3oXAIv6/z3+RURETa3l//+ldyc3Opqalh/PjxjB07lq1bt7JiRWQIzN69e/fb5vTTT+e2225jyJAhzJw5k4svvpgpU6ZEl1dWVnLJJZfwla98ha985SuNtr3jjju46667eP755wHo
3r07ixYt4ne/+x0AN998M9OnT+evf/0re/fuZcKECcycORMg2iPX9N6V0joUyI6A+XxkTptG+YsvEg4E8CUlUdS9iCfWPMGaPWsY1vXovR2HiMjRZscvfkHdx6tadZ/Jw06gx82HvqXevffey9NPPw3A5s2bCQQCbNiwge985zucffbZnH766fttk5GRweLFi5k3bx5z587l4osv5o477mD27NkAnHfeeVx//fVcfvnlh133K6+8wrPPPstdd90FQG1tLZ9++ikAp512msJYG9IhyyOUOXMG4aoqqr2BlGO7jQVgyS6NIxMRkUN74403ePXVV5k/fz7Lli1jzJgx1NXVsWzZMqZOncr999/PVVddxebNmyksLKSwsDB6Y26/38/UqVP52c9+xu9+9zv+9a9/Rfd7yimn8NJLL3Ek96p2zvGvf/2LpUuXsnTpUj799NPo/RjT09Nb541Ls9RDdoTSTjoJX1oaFa++Rsapp9Izoyc903uyeOdiLh92+H+ViIhIfLSkJ6stlJWVkZOTQ1paGqtWrWLBggWUlJQQDoe54IILGDp0KFdccQV9+/Zl6dKl0e1Wr16Nz+dj8ODBQGRgfv/+/aPLb7vtNm677TauueYa/vCHPzR6zczMTCq8E9Kamz/jjDO47777uO+++zAzPvjgA8aMGdNGn4DEUg/ZEfIlJ5M+eTIVr7+GC4eByDiyJTuXHNFfJSIicmw588wzCQaDDBs2jBtvvJGTTjqJrVu3MnXqVAoLC7niiiv45S9/ud92lZWVzJo1i+HDh1NQUMDKlSv56U9/2mide+65h5qaGq6//vpG7QUFBfj9fkaPHs3dd9/NtGnTWLlyZXRQ/y233EJ9fT0FBQWMGDGCW265pS0/AolhnTk8jBs3zi1atChur1/23PNs+9GPGPDYo6QWFvLP1f/kvxf8Ny984QX6ZfWLW10iInJwH3/8cfRQnEhbaO5nzMwWO+fGNbe+esg+g4wpkyEhIXpvy7HdI+PIFu/U5S9ERESk5RTIPgN/VhbpEyZQ8VrkNkqDsgfRJbmLBvaLiIjIYVEg+4wyZkwnsGEDdRs2YGaM6TZGV+wXERGRw6JA9hllTp8OED1sWdStiE8rPqWkZv97iYmISMfRmcdQS8d2JD9bCmSfUWLPnqSMHEmld7Pxou6R+1pqHJmISMeVkpLC7t27Fcqk1Tnn2L17NykpKYe1na5D1goyZ86g+Lf3UL9rF8PyhpGakMqSnUs4Y8AZ8S5NRESa0adPH7Zs2UJxcXG8S5GjUEpKCn369DmsbRTIWkHmjEggq3x9LjmXXExBXoEG9ouIdGCJiYkMHDgw3mWIROmQZStIOv54Evv32zeOrHsRq0tXUxGoOMSWIiIiIgpkrcLMyJwxk6oFCwhVVlLUvQiHY+mupfEuTURERDoBBbJWkjlzBtTXU/XWWxTkFZBgCTpsKSIiIi2iQNZKUkePxt+1KxWvvkZaYhrDug7T9chERESkRRTIWon5/WRMm0rlW2/hAgGKuhXxYcmH1IXq4l2aiIiIdHAKZK0oc8YMwpWVVC18n6LuRdSH6/mo5KN4lyUiIiIdnAJZK0qfOBFLS6PitVcp6ha5QKzGkYmIiMihKJC1Il9KChmTJlH52utkJ2VxXPZxumK/iIiIHJICWSvLnDmD4K5d1K5YwZjuY1i6aymhcCjeZYmIiEgH1maBzMxSzGyhmS0zs4/M7Gde+0Aze8/M1pnZ42aW5LUne/PrvOUD2qq2tpQxZQr4/VS8+hpF3YqorK9k7d618S5LREREOrC27CGrA6Y750YDhcCZZnYScCdwt3PueGAP8DVv/a8Be7z2u731Oh1/djZpE8ZT8dprjO0+FtCNxkVEROTg2iyQuYhKbzbRezhgOvCk1/4gcL43fZ43j7d8hplZW9XXljJnzCSwfj1dd9XRI72HrkcmIiIiB9WmY8jMzG9mS4FdwH+A9cBe51zQW2UL0Nub7g1sBvCWlwFd27K+tpI5YzoAla9HDlsu2bUE51ycqxIREZGOqk0DmXMu5JwrBPoAE4ATPus+zewbZrbIzBYVFxd/1t21icSePUkZPpyK115nbPexlNSUsLlic7zLEhERkQ6qXc6ydM7tBeYCE4EuZpbgLeoDbPWmtwJ9Abzl2cDuZvb1gHNunHNuXH5+fluXfsQyZs6gZulSxvgHAhpHJiIiIgfWlmdZ5ptZF286FTgN+JhIMLvQW20W8Iw3/aw3j7f8ddeJj/NlzpgJztF18Qayk7N1gVgRERE5oIRDr3LEegIPmpmfSPD7p3PueTNbCTxmZj8HPgD+4q3/F+AhM1sHlAKXtGFtbS55yGAS+/al8vXXGXPxGD7Y9UG8SxIREZEOqs0CmXNuOTCmmfYNRMaTNW2vBS5qq3ram5mROWMGex5+mPFXXcMbm9+gpKaEvNS8eJcmIiIiHYyu1N+GMmfOwNXXM2ZjZF6XvxAREZHmKJC1odQxY/Dn5NDlvTWk+FM0jkxERESapUDWhszvJ2P6NKrfmkdhzkj1kImIiEizFMjaWOaMmYQrKpi2uxur96ymMlB56I1ERETkmKJA1sbST56IpaYy/KNKwi7M0uKl8S5JREREOhgFsjbmS0khY9IppM1fgR+fDluKiIjIfhTI2kHGjBmEdu1ievUAXbFfRERE9qNA1g4ypkwBv5+pG1NZUbKCQCgQ75JERESkA1EgawcJOTmkjRvHgGW7CIQDrChZEe+SREREpANRIGsnmTNmkPjJdnqUOl2PTERERBpRIGsnmTOmA3DG5hwN7BcREZFGFMjaSWLv3iQPH8aEtbB011JC4VC8SxIREZEOQoGsHWVOn0Heut349pazbu+6eJcjIiIiHYQCWTvKnDkDc46xa50ufyEiIiJRCmTtKHnoUBJ792bShiQN7BcREZEoBbJ2ZGZkzpzBCesDfPTpIpxz8S5JREREOgAFsnaWMWMGCcEwvVcWs6ViS7zLERERkQ5AgaydpRUVQXYm49c4Fu/SODIRERFRIGt3lpBA1vQZjF0PH2xdFO9yREREpANQIIuDrJkzSa91lC18N96liIiISAegQBYH6SefTCgpgX5Ld1BSUxLvckRERCTOFMjiwJeaik0oZPwax5IdGkcmIiJyrFMgi5MeZ55HXgVsWPhqvEsRERGROFMgi5PsGTMIG7h5C+NdioiIiMSZAlmcJOTkUHZCLwYuK6YyUBnvckRERCSOFMjiKGXaZPoXOz5c9p94lyIiIiJxpEAWR8edexkAO196Ps6ViIiISDwpkMVR9sDB7OiVQtr8D+NdioiIiMRRiwOZmSWZWYGZjTKzpLYs6lhSduIJ9N5YQfWu7fEuRUREROKkRYHMzM4G1gP3Ar8D1pnZWW1Z2LEi57Qz8TlY98Jj8S5FRERE4qSlPWS/AaY556Y656YA04C7266sY8fIieewKxsqXtX1yERERI5VLQ1kFc65dTHzG4CKNqjnmNM1tStrRnQha/kmwtXV8S5HRERE4qClgWyRmb1oZrPNbBbwHPC+mX3RzL7YhvUdE+pPKSShPkz
52/PiXYqIiIjEQUsDWQqwE5gCTAWKgVTgXOCcNqnsGNL31DOoTIHt/34m3qWIiIhIHCS0ZCXn3FfbupBjWVGv8Twz2Jg89x2CpaUk5ObGuyQRERFpRy0KZGb2N8A1bXfOXdnqFR2DeqX34p1p+Uz5qJiSP/wvPf7rx/EuSURERNpRSw9ZPg+84D1eA7IA3YCxlZgZfUdN5K0xSex57FECn3wS75JERESkHbUokDnn/hXzeBj4EjCubUs7tswaPotHTg4R9Bu7fvvbeJcjIiIi7ehIb500GOjWmoUc64Z1HcbMogt5Zryj4t8vUbNsWbxLEhERkXbS0iv1V5hZecMzkcte3NC2pR17vjPmO7x+SibVmYns/PVdOLffsD0RERE5CrX0kGWmcy4r5nmIc+5fbV3csSY3JZfZE77FwyeHqFm0iMo33oh3SSIiItIODnqWpZkVHWy5c25J65Yjlw67lKcmP8GuxZ+QdNdvyDj1VCyhRSfDioiISCd1qB6y33iP3wPvAQ8Af/Kmf9+2pR2bEn2JXHfSDfx9cojA+vWU/d//xbskERERaWMHDWTOuWnOuWnAdqDIOTfOOTcWGANsbY8Cj0WTek8idfpU1vXxs/OeewjX1MS7JBEREWlDLT3Lcqhz7sOGGefcCmBY25QkANeN/xEPT/cTLi6h9MG/x7scERERaUMtDWTLzezPZjbVe/wJWN6WhR3rBmQPYPzpX+H9wcauB/5IsLQ03iWJiIhIG2lpIPsq8BHwPe+x0muTNnR1wdU8f0YOrraGkv/933iXIyIiIm2kpZe9qAXuB250zn3BOXe31yZtKCMpg0vPuI7XCozSRx8l8Omn8S5JRERE2kBLLwz7eWAp8JI3X2hmz7ZhXeI57/jzWPb5EwhYmO3/85t4lyMiIiJtoKWHLG8FJgB7AZxzS4GBbVOSxPKZj+/M/AnPTYDql16hZrmG7omIiBxtWhrI6p1zZU3aDnpfHzPra2ZzzWylmX1kZt/z2nPN7D9mttZ7zvHazczuNbN1Zrb8UBelPZYUdiuk9uKzKEuDzXfcrlsqiYiIHGVaGsg+MrPLAL+ZDTaz+4B3D7FNEPihc244cBJwjZkNB24EXnPODQZe8+YBziJy0/LBwDcAjWKP8Z1TfsQzk5MJLVlO5ZtvxrscERERaUUtDWTfAUYAdcAjQBnw/YNt4Jzb3nBrJedcBfAx0Bs4D3jQW+1B4Hxv+jzg7y5iAdDFzHq2+J0c5Xqk92Dgl7/Bthz45I6f40KheJckIiIiraSlZ1lWO+d+DExxzo13zv3X4ZxlaWYDiFzd/z2gu3Nuu7doB9Ddm+4NbI7ZbIvXJp6vjL6SV87Mw79pK6VPPRXvckRERKSVtPQsy5PNbCWwypsfbWZ/aOG2GcC/gO8758pjl7nIYKjDGhBlZt8ws0Vmtqi4uPhwNu30UhJSmDnrFtb0gq13/1q3VBIRETlKtPSQ5d3AGcBuAOfcMmDyoTYys0QiYexh51xDl87OhkOR3vMur30r0Ddm8z40c79M59wD3j01x+Xn57ew/KPHzP6nsejCESSWVrDtb3+KdzkiIiLSCloayHDObW7SdNBBTGZmwF+Aj51z/xOz6Flgljc9C3gmpv0r3tmWJwFlMYc2xWNmXHHJ7SwabJT+6c8E9+yJd0kiIiLyGbU0kG02s5MBZ2aJZnYdkUH6B3MK8GVgupkt9R6fA+4ATjOztcBMbx7gRWADsA74E/D/DvO9HDOG5g6ldPbZ+GvrWX/3HYfeQERERDo0a8k1rcwsD7iHSIDyAS8D33PO7W7b8g5u3LhxbtGiRfEsIW721u7lsSunMmlZgMH/fonkfv3iXZKIiIgchJktds6Na25ZS8+yLHHOXe6c6+6cy3fOXRHvMHas65LShdxrvkm9z/HRL26OdzkiIiLyGbT0LMtBZvacmRWb2S4ze8bMBrV1cXJwX5x4Fe+e2pXUNxZTtnRxvMsRERGRI9TSMWSPAP8EegK9gCeAR9uqKGmZBF8CRdfeRlkafPzfN+mWSiIiIp1USwNZmnPuIedc0Hv8A0hpy8KkZSYOns7yc4aS/dFmtv7nuXiXIyIiIkegpYHs32Z2o5kNMLP+ZnY98KJ3o/DctixQDu3M7/0PO3KMT++8XbdUEhER6YRaGsi+BFwNvA7MBb4FXAIsBo7N0xw7kH5dB7HjyzPI2VrOiod+F+9yRERE5DAdNJCZ2Xgz6+GcG+icGwj8DFgBPAeM9do1uL8DOPeqX7KpdyLV//sXgtVV8S5HREREDsOhesj+CAQAzGwy8EvgQaAMeKBtS5PDkZGUgf+7V5JVVs/8e/4r3uWIiIjIYThUIPM750q96YuBB5xz/3LO3QIc37alyeGa+fnvsmZ4FumPvUz5rv1uAyoiIiId1CEDmZkleNMziIwha5DQzPoSRz7zMfDGW0kOON75xbXxLkdERERa6FCB7FHgTTN7BqgB5gGY2fFEDltKB1Mw4XOsnzSAPq98yCcfL4x3OSIiItICBw1kzrnbgR8Cc4BJbt+VR33Ad9q2NDlSE/7rfwj5YNntN8S7FBEREWmBQ172wjm3wDn3tHOuKqZtjXNuSduWJkeqR/9h7Pr8iQxetIP35j4S73JERETkEFp6HTLpZE694TdUpvko/vVvCIQC8S5HREREDkKB7CiVmt2V0OwLOG5DNS8/+st4lyMiIiIHoUB2FJtw9Y/Zm5dC8gNPUFpVEu9yRERE5AAUyI5ivuRkun7vu/TdFeKFP/wo3uWIiIjIASiQHeUGXzibPQO7MvCfC1i1fXm8yxEREZFmKJAd5cyM4398G10r4I3/+RH7rlwiIiIiHYUC2TGg26TplI8bzLhXPuX1Fc/EuxwRERFpQoHsGDHyljtJDcCau39BbbA23uWIiIhIDAWyY0T60GEEz5rMpAUVPD733niXIyIiIjEUyI4hw2+4Dfw+gn98iB1VO+JdjoiIiHgUyI4hid27k3bFJZy8IshPH7iU1z55TYP8RUREOgAFsmPMgP/3fcI52Xz7TztYfd13+OFDl/Lx7o/jXZaIiMgxTYHsGOPPzGTI0/9H18suZ8rqBL72i2W8e9UF3P3Y9yip0dX8RURE4sE68yGrcePGuUWLFsW7jE4ruHs32//6AHsefpTE2no+GJyA76tf4gvnXU+yPzne5YmIiBxVzGyxc25cs8sUyCRUVsbGv/6ein88RkpVPWuOSyH7G19j8rn/D59PnagiIiKt4WCBTP/bCv7sbI6/9mZGvzWf2m9dTM9dQbrf8Hte+9yJfPTs3zXwX0REpI0pkEmULz2dMd/7KWPeeo/t3zyX1L3V+K7/Je+ePpHNzzyOC4XiXaKIiMhRSYFM9pOYmsb07/+K0a+9w4ffmEJtVTmVN/yUJaedSvG//omrr493iSIiIkcVBTI5oMz0LnzpB/cz+MUXee2qQnYF91Dy41v5cOZUSh95lHBdXbxLFBEROSookMkh9esygG9f9yj5j/+dh7/an43+Unbedhurpk9j99/mEK6ujneJIiIinZrOspTDEgqHeHbdM7
z01F3MmLuXUZ84LDuLvNmzybn8cvxZWfEuUUREpEPSZS+k1VXVV/GXD//C26/8lfPfCTJmbQjLSCf3ssvJnT2LhNzceJcoIiLSoSiQSZvZVrmNuxffzcfv/ZtLFiZS9FEdvuRkunzpIrpeeSWJPXrEu0QREZEOQYFM2twHuz7gVwt/xe41H/LVJdmM/qAM8/nocv755H372yR27xbvEkVEROJKgUzaRdiFeWHDC/x28W9x23fy7ZV9GfbuVnyJieRdcw25X/kylpgY7zJFRETiQoFM2lV1fTVzPprD31b8jdzSAFe/nsiwVVVU9Mpm+zfOJvuUU+mT0Yfemb1JTUiNd7kiIiLtQoFM4mJH1Q4eXfUom8o2kb7wY854Zgvd9jjmn2D8fYaP3VlGXmoefTL60CfTe2Tse85Py8dnujKLiIgcHRTIpEMI1day9YHfU/nnB3FmfHrBBBZO6c6ntdvZUrGFHdU7CLtwdP0kXxK9M3s3Cml9MvvQO6M3fTP7kpaYFsd3IyIicngUyKRDCWzZys47fknlq6+RNGAA3X/8YzJOnUR9qJ7tVZFwtqVyS6PnzRWbqayvbLSf3JTc6KHPPhl9GJwzmCl9piioiYhIh6RAJh1S5bx57Pz57QQ++YTM02bS/cYbSezdu9l1nXOUB8oj4axycySsxQS2HVU7CLkQqQmpzOg3g3MHncuJPU/E7/O387sSERFpngKZdFjhQIDSv82h5P77wTnyrv4GuVdeiS85+bD2Ux+uZ3nxcp7f8Dwvb3yZivoK8lPzOXvQ2Zwz6ByG5g5to3cgIiLSMgpk0uHVb9vGzjt/RcXLL5PYrx/db76JzKlTj2hfdaE63tz8Js9teI63t7xN0AUZkjOEcwedy+cGfY5uabommoiItD8FMuk0qt59lx0/v53Ahg1kTJtG95tvIqlv3yPe357aPby06SWeX/88y0uW4zMfJ/Y4kXOPO5cZ/WZovJmIiLQbBTLpVFwgQOlDD1H8+z9AMEjXq66i6ze+ji8l5TPtd1PZJp7f8DzPb3ierZVbNd5MRETalQKZdEr1O3ey61e/pvyFF0js3ZvuN99ExvTpmNln2q9zjg92fcBzG57j5U0vUxGIjDf73MDPce5x52q8mYiItAkFMunUqt5byM6f/zd1a9eRPvlUetx8M0kDBrTKvutCdby15S2eW/8c87bOIxgOMjhncGS82cDP0T29e6u8joiIiAKZdHquvp7Shx+m5L7f4QIBcq+8kryrv4EvrfXGgO2t3ctLm17iuQ3Psbx4OYZxYs/IeLOZ/WZqvJmIiHwmcQlkZvZX4Bxgl3NupNeWCzwODAA2AV9yzu2xyDGoe4DPAdXAbOfckkO9hgLZsSdYXMyuu+6i7JlnSejVk+433Ejm6ad95sOYTX1S/gnPb3ie59Y/Fx1vNr3fdM4ddC4n9TxJ481EROSwxSuQTQYqgb/HBLJfAaXOuTvM7EYgxzl3g5l9DvgOkUB2InCPc+7EQ72GAtmxq3rxYnbc9t/UrV5N+skn0/2//ovkQQNb/XWccywtXspz65/jpU0vURGoIC81jyuGXcGsEbNI8CW0+muKiMjRKW6HLM1sAPB8TCBbDUx1zm03s57AG865oWb2R2/60abrHWz/CmTHNhcMsuexxym+5x7CtbV0+cIXSB48mMRePUns2ZOEnj3xd+nSar1ngVCAt7a8xZNrn+Sdre8wOn80v5j0C/pl9WuV/YuIyNHtYIGsvf+87x4TsnYADSOmewObY9bb4rUdNJDJsc0SEsi94nKyzjqTXf/zP5Q98wyurq7xOqmpJPZsCGg9vOle+0Jbjx4tvitAkj+Jmf1nMrP/TF7c8CI/f+/nXPjchVw37jouGnJRqx82FRGRY0d795Dtdc51iVm+xzmXY2bPA3c459722l8DbnDO7df9ZWbfAL4B0K9fv7GffPJJm9UvnYtzjtCePdRv20799m0Et2/3phse2wgVl+y3nT8vLxraEnv2JLFXpHetYd7ftWuzYWtH1Q5+8s5PmL99PpN6T+K2k28jPy2/Pd6qiIh0QjpkKeIJBwIEd+yIBLUd2/cPbdu24WpqGm1jSUle71qvaEhLGz+OtJNOwuF4bNVj3L34bpITkrnlpFs4Y8AZcXp3IiLSkXWkQ5bPArOAO7znZ2Lav21mjxEZ1F92qDAmciR8SUkk9etHUr/mx3055wiXlcUEtMa9bVXvvktw1y74gyN58GByZ32FS869gIm9JnLzvJu57s3rmLt5LjefeDNZSVnt/O5ERKSzasuzLB8FpgJ5wE7gVuD/gH8C/YBPiFz2otS77MXvgDOJXPbiq80drmxKPWQSD+G6OspfeJHSBx+kbvVq/Lm55FxyCRmXXMScbU/zx+V/JC81j/8+5b+Z2GtivMsVEZEOQheGFWkDzjmq33uP0jkPUvnGG1hiIlnnnMOe807hph1/ZGPZRi474TK+P/b7pCakxrtcERGJMwUykTZWt3Ejex56iL1P/x+upoaUkybwxsmZ3J30BgO6DOIXk37ByLyR8S5TRETiSIFMpJ2E9u5lzxNPsOcfDxPcuZNQ3x48WVjDv4fWMHvcN7mq4CoSfYnxLlNEROJAgUyknbn6espffoXSOXOoXbGCurREXhwdZNPMYdx0zl0MzG79uwqIiEjHpkAmEifOOWqWLKF0zoOUv/YqIRwLhyWQO3sW553zA3zmi3eJIiLSThTIRDqAwJYtbP3rA5Q/9RRJtSG2DspiyLd+SN/PXYD5dbNyEZGj3cECmf48F2knSX36MPAntzFi3ny2XXkGSaUVVP3oVj6cMZnSBx8kVFkZ7xJFRCRO1EMmEieb9mzgofu/zcjXNnLCFrCMdHIuuJCcL3+ZpD69412eiIi0Mh2yFOmgguEgf1vxN1568Xect9jHuJX1mHNkzpxJ7uzZpI4p1E3LRUSOEgpkIh3cx7s/5qZ5N7Fn8zqu3TiUYW9vIVxeTkpBAdlnf470yZNJGjBA4UxEpBNTIBPpBOpCddy75F4eWvkQx6X04bayaaQ/N4+6tWsBSOzXj4zJk8mYMpm0CRPwJSfHuWIRETkcCmQincj7O97nx2//mJ3VO7lq1FVcljUTe+8Dqt58i6r33sPV1mIpKaSfdBIZUyaTMXkyib015kxEpKNTIBPpZCoCFdyx8A6eXf8shjG863BO6nkSJ+aOYegnQQJvL6DyzTep37wZgKTjjyNj8hQyJk8mbWwRlqi7AYiIdDQKZCKd1IfFH/L21rdZsH0By4uXE3RBknxJjOk2hpN6nsiJ9f3ouXwb1fPmUfX+Iqivx5eeTvopp5AxZTLpp55KYrdu8X4bIiKCApnIUaGqvorFOxezYPsC3tv+Hmv2rAEgMymT8d3Hc3LOGIo2J5L+/mqq3nqL4M6dACQPHxYZezZ5CqmjC3QRWhGROFEgEzkK7a7ZzcIdC1mwfQELti1gW9U2ALqldeOkHicyuW4AJ6yuws1fTM0HSyEUwp+dTfqkSWRMnUL6pEkk5OTE9020glA4RHWwmqr6KqrrI89VwSqq6quoC
9ZR1L2IHuk94l2miIgCmcjRzjnHlootLNgRCWcLdyxkb91eAAZlD2JSViGnbM2g9/Id1L2zgFBpKZiRWlBA+pRI71nK8GGYr+1v3uGcoyZYEwlOXniqrq9uFKai096jur46GrJil1UHq6kJ1hz09RIsgTMHnsnsEbMZmju0zd+fiMiBKJCJHGPCLszq0tXRw5uLdy6mNlSLz3yMzBnBabWDKFwfImvxOupWfATO4cvMJCE/H39ODv6cLiTk5ODvkuPNx7Tl5ODr0oW6FD8V9RWUB8opqyujPFBOeV154/mGR13j55ALteh9pCakkp6YTnpiOmkJafumE73phCbzTdY1M55d/yz/WvMvqoPVTOw5kdkjZjOx10Rd001E2p0CmcgxLhAKsKx4WTSgrShZQciFSPYnc0rqSGZsz6X/p7WE95QR3rsXK6vAV15FYkUNvlDz/0YEfVCZCuWpUJEGFalGRVpkvjLNR31mKi47A7Kz8Od0ITGnK2lZOWQlZ5OZlLkvSMWEqqbzPmudHrvyQDlPrH6Chz9+mOKaYobkDGH2iNmcOeBMEv06I1VE2ocCmYg0UhmoZNHORdGAtm7vumbXy0zIoBuZdA+kkl+fQte6RHJq/WTX+siodqRVBUmprCepshZ/eTW+skpcWTmEw83uz5KS8OfkkNinD6mFo0kdPZrUwsJ2OxO0PlTPCxtf4MGPHmTd3nV0S+vGl4d9mQuGXEBmUma71CAixy4FMhE5qOLqYtbtXUdmUiZZSVlkJWWRmZSJ33f4Z2S6cJhweTnBPXsI7dlLaG/s8x6CpXsIrF9P7cqVuPp6ABJ79SK1sDDyGFNIytChWFJSa7/NfTU6xzvb3mHOijm8t+M90hPTuWjIRVw+7HKdACAibUaBTEQ6nHAgQO1HH1GzdBk1S5dSs3Rp9FIdlpxMysiR7dKLtnL3SuZ8NIdXNr2CYToBQETajAKZiHQK9Tt2RMLZB5GA1p69aNsqt/GPj/+hEwBEpM0okIlIp9SoF21ZpCctuGMHENOLNnp0pCetlXrRyurKeHLNkzoBQERanQKZiBw12qsXLRAK8OLGF4+5EwBqgjV8VPIRy4qX8dHuj+iW1o1pfadR1L2IRJ8CqchnoUAmIketcCBA3cqVVC9dGh2PFu1FS0rCn9cVf3o6vrR0fOlp+KLTB3uk7VsnLY35ZUt5cM3DR90JAM45tlVtY9muZSwtXsqy4mWs3b2atMogeeVwfDCXTxPLWdMtSHpqNpP7TGZa32mc0vsU0hPT413+YakN1rJk5xLmb59PcU0xY/LHML7neAZmDdQhaWk3CmQickyJ9qIt/5DQnj2Eq6qafYSqq8HrXTsUS0wknJZCZUKQPb5aapIhIzuf3t2OJ7tLd3yZGZEL5+Z2xZ+bQ0LXriTk5uLPzcWXmdkh/tOvqa/h440LWbt6Pts2LGfv5vUk764grxzyK3z0qEogq6weX7DxZUtcUiK7BmSzuFsly3sE2Ng3iRHHncS0vtOY1nca+Wn5cXpHBxZ2YdbsWcO7295l/rb5LNm5hEA4QIIvgS7JXSipKQEgLzWP8d3HM77neMZ3H0//rP4d4ruSo5MCmYjIAYQDgQMEtup909WNl1WV7Wb7rg2U791Jcl2Y7FAS6QEfvura5l8kMTES1rp23fecGwlvCV0joa0hvPlzu+JLTzuiUBCqrCK4Yzv127dTv207ezavo+STVdRu3YIVl5Kxt46UJvkznODDl59HSq8+JPXsRWLPHiT06EFiz54kdOtO/dat1CxZQvUHH1C7ciUEgwDszE9kRa8ga3obbtRQCorOZHr/GQzKHhS3QLOjagfzt81n/vb5vLf9PUprSwE4vsvxTOw1kYk9JzK2+1hSE1LZXLGZhTsW8v6O93l/x/sU1xQD0C21WzScTegxgT6ZfRTQpNUokB0p56B2L6R2/hswi0jrK6sr44k1kTsAlNSUkBB0ZFVDfl0yfYKZ9AikkV+XRG6tn+wqSK8KkVJRR2J5Nba3Aqqbvw+nJSUdOLzl5ILP5wWvHdTv2E7Qew5XVDbaT9hgTwbsyfIRzO9CSs8+5PYfQp/jCsntP5iEHj1IyMtr8T1Mw7W11H74IdUfLKV6yRKqliyG8gogcoeGNb2NncflkDN+IoWTL6Cwz4QjupZdS1XXV/P+jveZv30+87fNZ0PZBgC6pnSNBLBeEzmp50l0Szv4yR7OOT4p/6RRQNtduxuA7mndmdBjAuN7jGd8j/H0yezTZu9Hjn4KZEdqwxvw6GVw0jdh4rchLbftXktEOq1AKMCinYsori5md+1uSmpK2F2zO/Lw5htu9h4rsd7Roz6NfsEsetan060uma61iXSpNjKqw6RWBEgur8VXVgl7ynC1jXvgQl0yqOiSzI6MIJ+kVLIr07E7C3zd8+k5cBTHHz+B0b2KGJIzpE0G5LtwmMCmTdQsWcLuhe9Svvh9UrZGDgUGffBprwTqhg2k+8QpFEz/Epk9+n6m1wuFQ3y0+6NoL9iyXcsIuiAp/hTGdh8bDWBDcoZ8pl4t5xwbyzby/o73WbhjIYt2Lor2tvVK78W4HuOiIa1XRq/P9J7k2KJAdqR2r4fXfw4fPQXJWTDxGjjpW5CS3XavKSJHpfpwPaU1pY0DW+3uaHArqY20ldSUUB4ob3YfXV0G/ULZBIK1rEkooT7BSPYnM6LrCEZ3G83o/MgjLzWvnd/dPsE9e9jz/nzWv/0CtUuWkruxlETvXvJleakw6gT6TJxB3omnknz8cZj/4D1omys2M3/bfBZsX8CC7QuoCER65IblDov2go3pNoZkf3KbvSfnHOv3ro+Gs/d3vB8N2L0zejfqQevsJ3pI21Ig+6x2rIA3fgmrnoeULnDKd2HC1ZCc0favLSLHnEAoQGltaTSgxYa4kpoSEnwJFOQXUJhfyJDctun9ai2B2mqWvfM0m+b9m9DylfT/pIYuVZFlwbQkkgpGkTvhZFJHjcKSkqmqq2B1yUpWFUcee2pK8IUhN6kLQ7MGMyT7OAZlDiTdn4ILhnDhEITCEA7hQmFcKAihcLR9//kQvpRkkgYMIGnQIJIHDcKffXh/ZIddmLV71rJo5yIWbo+EtIYQ3S+zXzScje8x/pCHS+XYokDWWrZ9AHN/AWtfgbQ8mPR9GH8VJKa2Xw0iIp2Uc45VpauYv+j/2Db/NbJWb2PIFkffEvC11X9FPh/4/ZjfHxkr5/cTrq1tdHatv2tXkgcN8gLaQJIGHUfyoIEk9OzZovF1DWd0Lty+kPd3vs/iHYupqI/05OWl5tEnow99Mr1Hxr7n/LR8fNay8XtydFAga22bF8Lc2yNjzDK6w6nXwdhZkNB2XeYiIkebbZXbmLt5Lu+u/g9lHy2nf0ZfhncbwYhuBRyXO5iEhGTM7wN/QuTZ5z/AfCRw7T/va3YsmQuFqN+yhboNGwhs2EjdhvXe8wbCZWXR9Sw1laSBA0geOIikQQNJPu44kgYOImlAf3zJB/73PhQOsXrPat7f8T7r965na+VWtlRsYUf1DsJu3yVFknxJ9Mro1TioxUy3
97XenHOUB8ob9cw2nd5du5vaYC39svoxKHsQA7MHMih7EIOyB5GRpKNGh6JA1lY2vRMJZp+8A1l9YPJ1MOYK0O1VREQ6HeccodJSAhs2ULdhI4EN6yPP69dTv23bvhV9PhL79CF54ECSjjvO61XzDn926XLA/deH6tletZ0tFVvYUrml8XPFlmivWoOc5JwDhrXuad1bdAZrbMhqOn6x6XRpbSnBcHC/fSRYArmpuXRN6Upeah6JvkQ+rfiUTeWbGq3fLbUbA7vsC2iDsgcxqMsguqZ01aVDPApkbcm5SE/Z3Nthy/vQpT9MuQEKLgZ/QnxrExGRVhGuqSGwaRN16zdEAtvGDQTWbyCwaRMuEIiu58/NjfSmDTou8jxwIEkDB5LYqxeWcPD/E8rqyhoFtIbprZVb2V65naDbF34SLIGeGT2jAa1nek+qg9XN9mgdKmR1TY0ErQNNZyVlNRuoguEgWyq2sKFsAxvKNrCxbCMb9kamq4PV0fUykzIb9aQ1PHpl9NovVLpgkFBFBeHyckLeJVX8WZn4srLwZ2Ye8jPs6BTI2oNzsPY/kWC2fSl0PR6m3AgjvwhteB0eERGJHxcKUb9tG3XrI4c9Axs3RELb+vWEYg5/kphIUt++JA0cSNKA/pGgNmAASQMG4O966B6kYDjIzuqdzYa1LRVb2FO3Z7+Q1dCj1dx0VnJWm4xfc+EwocpKdu7awJbtq9mxYx27d33Knt1bqdq9EyqrSK+D9FrIrPORG0wmK+AnrcaRVFOPr6buoPv3paXhy87Gn5mJPysrGtR82Vn4M7PwZ2fh8579mV6Qy4q0HekFl1uTAll7cg5WvRAZ/L/rI8g/AabeBMM+HxlcKiIix4RgaSmBTZsIbNwYed60ibqNG6n/5FNczEkFvszMaDhLGjiA5AEDIsGtf398aWkteq3aYC1J/qRWC1nOOcLl5QRLSwnt2UNoz57IdGlkOlRRTri8glBFBaHysuh0uKICwuGD7zsjjfq0JGpSfVQkhSlNqGN3Yi1VyVCVYtSkGEldcsju2ouc5BwyA37SayG91pFSEyK5pp7Eqnr8VbX4qmqwiipcRSXhysqDvi5+f+PwlpWJL8sLd9lZZEyZQtr48a3y+R2IAtkRqq0PsbO8lv5dj2BgZTgMK/8vcrmMkjXQfRRMuxmGngU6li4icsxq6FWLhLVNBDZFAlvdpk0Et21vtG5C9+7RXrWkAQOiPWuJvXsf1uE7FwwS2rs3JlSVemFrL6HSUoJ7vPbSUoJ79xDaszd6m6ymLDUVf5YXaDK9HqiszMY9VFmZ+DIz8Wdl7zvkmJWFLz292WvP1QZr+aT8k+jhz4ZDn8U1xVQGKgm50EHfn2Fk+NPID6fTNZhCXjCF7PpEutQlkBXwkxEw0mscKbVhUqqDJFXXk1BVi7+qFquswVVUkvf975J/5dda/JkeCQWyI/TKRzv4xkOLGZSXztSh3Zh2Qj4TBuaSnHAYhyDDIfjwSXjzDijdAL2KYNqP4fgZCmYiItJIuKaGwKef7gtqGzdFw1q4uUOgXq9aUt++hGtroz1YjQLWnj2Nt23Cl50duU1Xbi7+nJzIrbpycvHn5kTusZoT056biy8lpR0+iX2cc9QEa6isr6QyUBl9rqiviM5XBCr2PQcqqaqv2m95fbj+oK/zvcLvctXor7fpe1EgO0I7y2t5acUO5q7exfz1u6kLhklL8nPK8XlMG9qNqUPz6dWlhdcgCwVh2aPw5q+g7FPoe2IkmA2a0mb1i4jI0cE5R2jv3sjhTy+kNfSsBT75dN+JBX5/JDy1NGB16YIlHhtXBqgL1UUDW0NIq6qvioa5Md3GMDJvZJvWoEDWCmoCIeZvKGHuqmJeX7WLrXsjNwU+oUdmpPdsaD5F/XNI9B/i+H0wAB88BG/dBRXbYMCpkWDWf2I7vAsRETnauFCIYHExvpQUfFlZLb5ZvLQ/BbJW5pxjfXElc1cVM3f1LhZuLCUYdmSmJDB5cD5Th+YzZWg+3TIP0q1bXwuL58C830DVLjhuOkz4BmT1hvR8SM/T9cxERESOIgpkbayitp531u3mjdW7mLt6FzvLI6ftjuqdzbSh+Uw9oRuj+3TB72tmzFigGt7/M7zzW6je3XhZao4XzrpFAlp6fuSRkb9vuuGRnKkxaSIiIh2YAlk7cs6xcns5b6wuZu6qXSz5dA9hB7npSUwZEuk9mzw4n5z0pMYb1lXCjuVQVew9SiLPlbv2TVftgtoDDMz0Jx84rDX0uGV0i0yndVXvm4iISDtTIIujvdUB3lpbwhurdvHGmmJKqwL4DMb0y4n0ng3txohezV8FuVnBAFQ3hLXimAAXG9xilh3orJK0rpDZM3IvzsyekNkdMnpAZswjo7vuzykiItJKFMg6iFDY8eHWMuauihzaXL4l0tvVLTOZqUPzGTcgl26ZyeRnJpOfkUxuehIJhzpJ4GCci/SoVRU3flQWQ+VOqNgBlTu8513Q3HVeUnP2D27R+Ybg1gMS2/c0aBERkc5GgayDKq6o4801kRMD3lpTTEVt44vwmUFOWhJ5GUnkZSSTlxEJa5HpJPK84JaXkUzXjKRDn+F5MOFQZAxbxXao2Bl5bght0eC2M/LczH3RSOmyr1etUXDrBsnZkJwBSRmRsW7JmZHphKT99yMiInKUUiDrBIKhMFv21FBSWUdJZR3FlQFKKuoorqyjpKLOaw9QUllHdaD5KxbnpCVGg1teZnI0yOXHBrnMJLqmJ5OUcIThLRyGmtJ9wa1yRzMhzmsPBQ6+L3+SF9IyIMkLatHg1tCWsS/ARZ8b1snaN52UrpMaRESkQztYIOvct00/iiT4fQzIS2dA3qFv01QdCFJSEaC4spbiikA0xJVU1lHizS/fspeSijqqDhDe0pP8ZKYkkpGSQGZKAhnJCWSlJJKR7M2nJJCZkkhmSgKZyQmN1s1MziAzdzgp3UceeOybc1CzJxLQ6iqgrjxy4kKg0nuuaDxfVxFpqy6FvZ/GLKsAWvJHgzUJa5mNQ93hBr+j/aQH56C+xvvcKyPfTyjonfzRHZJadv88ERFpHQpknVBaUgL9uibQr+uh/9OsCYS8HreGnrZIYCurqaeitp7KuiAVtZHHtr01VNQGqawLHrAXLlaCz8jwwlzj8LYv0GUkJ5KWlE9aUg9SkxJIS/aTluknNclPWlICaUkN035SEvz4ml4axDkIVB0kyJXHLKuMCRjedPUnjdtCdS37kP3JzR9mbWhLTIscck1IiaybkOQ9ew+/tyw63dCe3GQ6ZrtD9fA5B8G6fQG34f3GBtq6igO0Ve5b1tDmDnID4KTMyOHmjO7NPHf3TgLpDml54Nc/IyIin1WH+pfUzM4E7gH8wJ+dc3fEuaROLzXJT9/cNPrmHl6PRzAUpqouRHmj0BaZLq8NUhkz37CsojbI9rJa1u7aNx8MH94h8dREfzSkxU6nJSVEnhP9pCXlkJqUT1pSzPI0P8kJfhqOwDtczHTkciQAFq7HX19FYrAKf7ASf30VCcEqEuqr8AerSQxWkRCsjLTFPBKrqkk
o2x6ZDlbhD9fiDwfwhwIYBwk2h8OftC+kJaR480mRnqyGENXc+L39WOPevobev8zujXsIo+t4h359CZEzdSt3Rk7yaHje+RGsnwt1zV1yxfb1qjUX3GLbUroc3mHlcDhy2DtY6z3XRR6hhmdvWTCwry12ecM6AOaLPHx+b9ofM29N5mOX+5pZv2HeDrC/g72Wr8nygy1rpk4ROWp1mEBmZn7g98BpwBbgfTN71jm3Mr6VHZsS/D6y03xkpx35oTvnHHXBMNWBENWBIDWBkDcdoqY+0gtXEwhRUx/THtjXXh0IUV0fadtZXhuzfZDa+jCBUGsEIR+Q6T2OjJ8QSdSTTD1JBEmyyHR0nnqSrZ4k6kn3h0j3B0nzhUj1hUjzBUn1BUnxBUmxIKlWT7IFSSZIcqiexFCQoD+FurQ0ajPSqfOnEfCnEfCnE/ClU+tP9+bTqPenU+dPp96XAj4fhkX/Dze83GGGAdQBARqt4zPw+wbiNyMhxfCnGf7uht9nJPiNxHAdGfWlpNXvJi2wm7RACal1u0kJlJBSW0Ly3hKSd6wmqaYYX3j/8YNhXxKhtHxCad1wialYqA7zQpOF6rxHwHvU4TvEjYCPPdY4yPn8MdMJ3nxCM20tWSfB23cz6wDgIj2qrulzOGZZk+XNtsdu02RfuH0h1Ofb9x6jz82F2AOt29L2puH7QK9jR75Nwy9YdLy2a36+0eQB1mk05ruZttg/LswOEPJ9TZY3bffFvLfmHt57aunPwQGXxba7A+8LjuB7bdLeSXSYQAZMANY55zYAmNljwHmAAlknZWakJPpJSfST2/RCuK2gPhSmpn5feKsLhpoNIQ1zjdttv3Vit4V9Aaa59YLhMHXBMIFg5LmuPkQgFKbOC4p1wdC+6ehzKLJuMEx1MMzeoLdedD+hffuLaQs3+nc30vMX/WfaOa8HECCAc3V4qzW7jvNao/uI6VEMu8ilWQ7NgDzv0RxHFtXk217yrYx8vGfbS7f6veSX7SXZSqlziQRIJEAadSQScIkESIhM4027pJi2hJhtEqgjiYBLIEDivuUkNlonQOQPCh8OH2F8hPETjpl3+Alj3rOPMD7bvz3BHIk+R6KPyLM1nk7wnn2EMe81cA378Pbr9i33RR9em4ttC2PONZr3uUgt+2oO4Q+H8bswCYTxW5gEQvhwJBCKLCdMgkW2jywL43d1+KiJbNPQFl0W8vYfwue8du9SOM58OK8CB14VFnlY5B3RpD3csDw67Wumbd8yvCX+6He077nptA+HL/qZhaLT5n1WPkL7PlNvXo5dYfNHfobNDxguZj76jI/y8d+jx4z/F7c6O1Ig6w1sjpnfApwYp1qkE0j0+0j0+8hKOcoH4Lcj5/YFs1DYEQyHCYchGA5H2pwjGHLR6VA4Mh92jmDYEQqHCcWuH92PI+w9l4Yj6/vMoqE3AUjy5iPDCBumLRqGfWbQTFtDwPYW4/PtWw6Gi9bmqA+Fo/UEQy5aZ8N8KByOWXfffGTdg88Hwi5SQIymBxljT4LZf1nLt234nsIuEtDDznmPxt/hgZZHljXez775yHLnIORc5DP1PtyG6ej3ETPf3PcR+1019M42Xb9hH7Dv/TT8fDXUGnIQDu+rrWmNIe/nK+zNh8OOEDFtYYdzIcyFcOFIWPObi4bzBC9MR8Kgw28NIdDh85YleNExIXYZ+/bjJxxdd9+ysPddxvyxZ+a1RCJow89poz8eY38gop+9L9pu0R8Qiz6bgXm9TBbTG2XsmzcXBu+PhYY/BPCeo9t6oRfvjwDz2vb9kRCJ5eGYMB0J6eYthX1/esQG8H1hPkzD970vpDcN6w3rA/uFdH/MH037vrfmp33WuO1g26WXpfM54qcjBbIWMbNvAN8A6NevX5yrETm6mBl+I+a+q/6Dri8ix47YsN8wTrch8Du85/C+HveGPwRw7PsDgki4buixj/3DomH/Dkc4HGlvEHuEomG+ubbIfOOjI9H1o/uymCUNy+wzDdFpDR0pkG0F+sbM9/HaGnHOPQA8AJHrkLVPaSIiIse2/f9gk9bUkUa7vQ8MNrOBZpYEXAI8G+eaRERERNpch+khc84FzezbwMtEjpP81Tn3UZzLEhEREWlzHSaQATjnXgRejHcdIiIiIu2pIx2yFBERETkmKZCJiIiIxJkCmYiIiEicKZCJiIiIxJkCmYiIiEicKZCJiIiIxJkCmYiIiEicmXOd9+5DZlYMfBLvOuSQ8oCSeBchh6TvqfPQd9V56LvqPNrju+rvnMtvbkGnDmTSOZjZIufcuHjXIQen76nz0HfVeei76jzi/V3pkKWIiIhInCmQiYiIiMSZApm0hwfiXYC0iL6nzkPfVeeh76rziOt3pTFkIiIiInGmHjIRERGROFMgk8/EzPqa2VwzW2lmH5nZ97z2XDP7j5mt9Z5zvHYzs3vNbJ2ZLTezovi+g2OLmfnN7AMze96bH2hm73nfx+NmluS1J3vz67zlA+Ja+DHGzLqY2ZNmtsrMPjazifqd6pjM7Frv374VZvaomaXo96pjMLO/mtkuM1sR03bYv0dmNstbf62ZzWqrehXI5LMKAj90zg0HTgKuMbPhwI3Aa865wcBr3jzAWcBg7/EN4H/bv+Rj2veAj2Pm7wTuds4dD+wBvua1fw3Y47Xf7a0n7ece4CXn3AnAaCLfmX6nOhgz6w18FxjnnBsJ+IFL0O9VRzEHOLNJ22H9HplZLnArcCIwAbi1IcS1NgUy+Uycc9udc0u86Qoi/3H0Bs4DHvRWexA435s+D/i7i1gAdDGznu1b9bHJzPoAZwN/9uYNmA486a3S9Htq+P6eBGZ460sbM7NsYDLwFwDnXMA5txf9TnVUCUCqmSUAacB29HvVITjn3gJKmzQf7u/RGcB/nHOlzrk9wH/YP+S1CgUyaTVe9/sY4D2gu3Nuu7doB9Ddm+4NbI7ZbIvXJm3vt8D1QNib7wrsdc4FvfnY7yL6PXnLy7z1pe0NBIqBv3mHl/9sZunod6rDcc5tBe4CPiUSxMqAxej3qiM73N+jdvv9UiCTVmFmGcC/gO8758pjl7nIqbw6nTeOzOwcYJdzbnG8a5FDSgCKgP91zo0Bqth3WAXQ71RH4R26Oo9IiO4FpNNGvSfS+jra75ECmXxmZpZIJIw97Jx7ymve2XDYxHve5bVvBfrGbN7Ha5O2dQrweTPbBDxG5JDKPUS65RO8dWK/i+j35C3PBna3Z8HHsC3AFufce978k0QCmn6nOp6ZwEbnXLFzrh54isjvmn6vOq7D/T1qt98vBTL5TLzxD38BPnbO/U/MomeBhrNRZgHPxLR/xTuj5SSgLKb7WNqIc+4m51wf59wAIoOOX3fOXQ7MBS70Vmv6PTV8fxd663eYvySPZs65HcBmMxvqNc0AVqLfqY7oU+AkM0vz/i1s+K70e9VxHe7v0cvA6WaW4/WInu61tTpdGFY+EzObBMwDPmTf2KSbiYwj+yfQD/gE+JJzrtT7R+t3RLr1q4GvOucWtXvhxzAzmwpc55w7x8wGEekxywU+AK5wztWZWQrwEJ
ExgaXAJc65DXEq+ZhjZoVETr5IAjYAXyXyB7R+pzoYM/sZcDGRM84/AK4iMsZIv1dxZmaPAlOBPGAnkbMl/4/D/D0ysyuJ/L8GcLtz7m9tUq8CmYiIiEh86ZCliIiISJwpkImIiIjEmQKZiIiISJwpkImIiIjEmQKZiIiISJwpkIlIuzEzZ2a/iZm/zsx+2kr7nmNmFx56zc/8OheZ2cdmNrdJu8/M7jWzFWb2oZm9b2YDW+H1BpjZis+6HxHp2BTIRKQ91QFfNLO8eBcSK+aq6i3xNeDrzrlpTdovJnL7nALn3CjgC8De1qlQRI52CmQi0p6CwAPAtU0XNO3hMrNK73mqmb1pZs+Y2QYzu8PMLjezhV5P1HExu5lpZovMbI13/07MzG9mv/Z6rJab2dUx+51nZs8Subp603ou9fa/wszu9Np+AkwC/mJmv26ySU9gu3MuDOCc2+Kc2+Ntd7qZzTezJWb2hHfvV8xsrPfeFpvZyzG3dBlrZsvMbBlwTUxNI7z3vdR7L4MP69MXkQ5LgUxE2tvvgcvNLPswthkNfBMYBnwZGOKcm0DkavbfiVlvADABOBu437sy+teI3AZlPDAe+HrMocQi4HvOuSGxL2ZmvYA7idzzsxAYb2bnO+duAxYBlzvnftSkxn8C53ph6TdmNsbbVx7wX8BM51yRt/0PLHIP2PuAC51zY4G/Ard7+/ob8B3n3Ogmr/FN4B7nXCEwjsh9L0XkKHA43fQiIp+Zc67czP4OfBeoaeFm7zfcn9HM1gOveO0fArGHDv/p9VCtNbMNwAlE7j1XENP7lg0MBgLAQufcxmZebzzwhnOu2HvNh4HJRG67cqD3tcW7/+R07/GamV0EpALDgXcid2chCZgPDAVGAv/x2v3AdjPrAnRxzr3l7foh4Cxvej7wYzPrAzzlnFt7wE9MRDoVBTIRiYffAkuI9AQ1COL12puZj0hwaVAXMx2OmQ/T+N+xpveCc4AR6W1qdENg756eVUdS/IE45+qAfwP/NrOdwPlEwuN/nHOXNnn9UcBHzrmJTdq7HGT/j5jZe0R6AF80s6udc6+35nsQkfjQIUsRaXfOuVIih/i+FtO8CRjrTX8eSDyCXV/kne14HDAIWA28DHzLO0SImQ0xs/RD7GchMMXM8szMD1wKvHmwDcysyDvU2RAoC4jcvHgBcIqZHe8tSzezIV5t+WY20WtPNLMRzrm9wF4zm+Tt+vKY1xgEbHDO3Qs8472GiBwFFMhEJF5+A8SebfknIiFoGTCRI+u9+pRImPo38E3nXC2RcWYrgSXe5SP+yCGODniHR28E5gLLgMXOuWcO8drdgOe811hOpMfvd95hz9nAo2a2nMhhxxOccwHgQuBO7z0vBU729vVV4PdmtpRID1+DLwErvPaRwN8PUZOIdBLmXNMefhERERFpT+ohExEREYkzBTIRERGROFMgExEREYkzBTIRERGROFMgExEREYkzBTIRERGROFMgExEREYkzBTIRERGROPv/4A3x8Y6Wx8YAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "seed_idx = list(range (50, max_seeds +1, 50))\n", + "\n", + "plt.figure(figsize=(10,5))\n", + "\n", + "for i in range(len(data)):\n", + " plt.plot(seed_idx, perf[i], label = names[i] )\n", + "\n", + "plt.title('Speedup vs. Number of Seeds')\n", + "plt.xlabel('Number of Seeds')\n", + "plt.ylabel('Speedup')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3786" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "del time_algo_cu\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "-----\n", + "Copyright (c) 2021, NVIDIA CORPORATION.\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n", + "\n", + "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cugraph_dev", + "language": "python", + "name": "cugraph_dev" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/cugraph_benchmarks/release.ipynb b/notebooks/cugraph_benchmarks/release.ipynb index d3110da3621..a6eeeb65cdf 100644 --- a/notebooks/cugraph_benchmarks/release.ipynb +++ b/notebooks/cugraph_benchmarks/release.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# Release Benchmarking\n", + "# Skip notebook test\n", "\n", "With every release, RAPIDS publishes a release slide deck that includes the current performance state of cuGraph. \n", "This notebook, starting with release 0.15, runs all the various algorithms to computes the performance gain. \n", @@ -21,6 +22,7 @@ "| Triangle Counting | X | |\n", "\n", "### Test Data\n", + "Users must run the _dataPrep.sh_ script before running this notebook so that the test files are downloaded\n", "\n", "| File Name | Num of Vertices | Num of Edges |\n", "| ---------------------- | --------------: | -----------: |\n", @@ -593,7 +595,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.8" } }, "nbformat": 4, diff --git a/notebooks/cugraph_benchmarks/sssp_benchmark.ipynb b/notebooks/cugraph_benchmarks/sssp_benchmark.ipynb index 2d040e0acaf..32b562e7a1e 100644 --- a/notebooks/cugraph_benchmarks/sssp_benchmark.ipynb +++ b/notebooks/cugraph_benchmarks/sssp_benchmark.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# SSSP Performance Benchmarking\n", + "# Skip notebook test\n", "\n", "This notebook benchmarks performance of running SSSP within cuGraph against NetworkX. 
\n", "\n", diff --git a/notebooks/demo/batch_betweenness.ipynb b/notebooks/demo/batch_betweenness.ipynb index e2ad83ff1c4..885d26c9523 100644 --- a/notebooks/demo/batch_betweenness.ipynb +++ b/notebooks/demo/batch_betweenness.ipynb @@ -138,7 +138,7 @@ "outputs": [], "source": [ "t_start_read_sg = time.perf_counter()\n", - "e_list = cudf.read_csv(input_data_path, delimiter='\\t', names=['src', 'dst'], dtype=['int32', 'int32'])\n", + "e_list = cudf.read_csv(input_data_path, delimiter='\\t', names=['src', 'dst'], dtype=['int32', 'int32'], comment='#')\n", "t_stop_read_sg = time.perf_counter()" ] }, diff --git a/notebooks/demo/uvm.ipynb b/notebooks/demo/uvm.ipynb index d279be8ed54..8fa2b08b6d1 100644 --- a/notebooks/demo/uvm.ipynb +++ b/notebooks/demo/uvm.ipynb @@ -6,6 +6,7 @@ "source": [ "# Oversubscribing GPU memory in cuGraph\n", "#### Author : Alex Fender\n", + "# Skip notebook test\n", "\n", "In this notebook, we will show how to **scale to 4x larger graphs than before** without incurring a performance drop using managed memory features in cuGraph. We will compute the PageRank of each user in Twitter's dataset on a single GPU as an example. This technique applies to all features.\n", "\n", diff --git a/notebooks/layout/Force-Atlas2.ipynb b/notebooks/layout/Force-Atlas2.ipynb index fa9ec0fd180..456af3c62de 100644 --- a/notebooks/layout/Force-Atlas2.ipynb +++ b/notebooks/layout/Force-Atlas2.ipynb @@ -4,7 +4,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Force Atlas 2" + "# Force Atlas 2\n", + "# Skip notebook test" ] }, { @@ -521,4 +522,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/notebooks/link_analysis/Pagerank.ipynb b/notebooks/link_analysis/Pagerank.ipynb index c43561ff48c..a81e1ccf6c3 100755 --- a/notebooks/link_analysis/Pagerank.ipynb +++ b/notebooks/link_analysis/Pagerank.ipynb @@ -11,7 +11,7 @@ "Notebook Credits\n", "* Original Authors: Bradley Rees and James Wyles\n", "* Created: 08/13/2019\n", - "* Updated: 08/16/2020\n", + "* Updated: 01/17/2021\n", "\n", "RAPIDS Versions: 0.14 \n", "\n", @@ -190,7 +190,7 @@ "metadata": {}, "source": [ "### Read in the data - GPU\n", - "cuGraph depends on cuDF for data loading and the initial Dataframe creation\n", + "cuGraph graphs can be created from cuDF, dask_cuDF and Pandas dataframes\n", "\n", "The data file contains an edge list, which represents the connection of a vertex to another. The `source` to `destination` pairs is in what is known as Coordinate Format (COO). In this test case, the data is just two columns. 
However a third, `weight`, column is also possible"
    ]
   },
@@ -219,8 +219,7 @@
    "outputs": [],
    "source": [
     "# create a Graph using the source (src) and destination (dst) vertex pairs from the Dataframe \n",
-    "G = cugraph.Graph()\n",
-    "G.from_cudf_edgelist(gdf, source='src', destination='dst')"
+    "G = cugraph.from_edgelist(gdf, source='src', destination='dst')"
    ]
   },
  {
diff --git a/notebooks/link_prediction/Jaccard-Similarity.ipynb b/notebooks/link_prediction/Jaccard-Similarity.ipynb
index 21835da1cce..7003bdbc98e 100755
--- a/notebooks/link_prediction/Jaccard-Similarity.ipynb
+++ b/notebooks/link_prediction/Jaccard-Similarity.ipynb
@@ -451,8 +451,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "pr_df.rename(columns={'pagerank': 'weight'}, inplace=True)\n",
     "# Call weighted Jaccard using the Pagerank scores as weights:\n",
-    "wdf = cugraph.jaccard_w(G, pr_df['pagerank'])"
+    "wdf = cugraph.jaccard_w(G, pr_df)"
    ]
   },
  {
diff --git a/notebooks/sampling/RandomWalk.ipynb b/notebooks/sampling/RandomWalk.ipynb
new file mode 100644
index 00000000000..84f8e1db07f
--- /dev/null
+++ b/notebooks/sampling/RandomWalk.ipynb
@@ -0,0 +1,195 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Random Walk Sampling\n",
+    "\n",
+    "In this notebook, we will compute the Random Walk from a set of seeds using cuGraph. \n",
+    "\n",
+    "\n",
+    "| Author Credit | Date | Update | cuGraph Version | Test Hardware |\n",
+    "| --------------|------------|--------------|-----------------|----------------|\n",
+    "| Brad Rees | 04/20/2021 | created | 0.19 | GV100, CUDA 11.0 |\n",
+    "\n",
+    "Currently NetworkX does not have a random walk function. There is code on StackOverflow that generates a random walk by getting a vertex, randomly selecting one of its neighbors, and then repeating the process."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Test Data\n",
+    "We will be using the Zachary Karate club dataset \n",
+    "*W. W. Zachary, An information flow model for conflict and fission in small groups, Journal of\n",
+    "Anthropological Research 33, 452-473 (1977).*\n",
+    "\n",
+    "\n",
+    "![Karate Club](../img/zachary_black_lines.png)\n",
+    "\n",
+    "\n",
+    "Because the test data has vertex IDs starting at 1, the auto-renumber feature of cuGraph will be used so the starting vertex ID is zero for maximum efficiency. The resulting data will then be auto-unrenumbered, making the entire renumbering process transparent to users."
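The StackOverflow-style walk mentioned in the notebook text just above can be written in a few lines of plain NetworkX. This is an illustrative sketch, not code from this PR; the early stop on a dead-end vertex mirrors the termination behaviour the notebook describes later for `cugraph.random_walks`:

```python
import random
import networkx as nx

def nx_random_walk(G: nx.Graph, seed, max_depth: int) -> list:
    """Walk from `seed`, picking a uniformly random neighbor at each step."""
    path = [seed]
    for _ in range(max_depth - 1):
        neighbors = list(G.neighbors(path[-1]))
        if not neighbors:            # no departing edge: terminate early
            break
        path.append(random.choice(neighbors))
    return path

# Example on a small built-in graph
G = nx.karate_club_graph()
print(nx_random_walk(G, seed=0, max_depth=4))
```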
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the modules\n", + "import cugraph\n", + "import cudf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Read The Data\n", + "# Define the path to the test data \n", + "datafile='../data/karate-data.csv'\n", + "\n", + "gdf = cudf.read_csv(datafile, delimiter='\\t', names=['src', 'dst'], dtype=['int32', 'int32'] )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gdf['wt'] = 1.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a Graph - using the source (src) and destination (dst) vertex pairs from the Dataframe \n", + "G = cugraph.Graph()\n", + "G.from_cudf_edgelist(gdf, source='src', destination='dst', edge_attr='wt')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# some stats on the graph\n", + "(G.number_of_nodes(), G.number_of_edges() )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a list with the seeds\n", + "seeds = [17,19]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# random walk path length\n", + "path_length = 4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rw, so, sz = cugraph.random_walks(G, seeds, path_length, use_padding=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A random walk generates a path from a seed vertex. At each step on the random walk (starting from the seed), the random walker picks a random departing edge to traverse. The random walk will terminate in two situations, when the maximum path length is reached, or when the current vertex on the path has no departing edges to traverse. The result of a single random walk will be a path of some length less than or equal to the maximum path length.\n", + "\n", + "cugraph.random_walks performs a random walk from each of the specified seeds. The output will be a path for each of the seeds. Because the path lengths might be variable length, the return value consists of a pair of outputs.\n", + "\n", + "The first output provides the edges used on the paths.\n", + "\n", + "The second output represents the seed offset, which is a cuDF Series. The seed offset identifies the offset of the first entry in the first output for a particular seed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rw.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "idx = 0\n", + "for i in range(len(seeds)):\n", + " for j in range(path_length):\n", + " print(f\"{rw[idx]}\", end=\" \")\n", + " idx += 1\n", + " print(\" \")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "-----\n", + "Copyright (c) 2021, NVIDIA CORPORATION.\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n", + "\n", + "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cugraph_dev", + "language": "python", + "name": "cugraph_dev" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/python/cugraph/__init__.py b/python/cugraph/__init__.py index d752c868237..9421bee6869 100644 --- a/python/cugraph/__init__.py +++ b/python/cugraph/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -24,11 +24,17 @@ analyzeClustering_ratio_cut, subgraph, triangles, + ego_graph, + batched_ego_graphs, ) from cugraph.structure import ( Graph, DiGraph, + MultiGraph, + MultiDiGraph, + BiPartiteGraph, + BiPartiteDiGraph, from_edgelist, from_cudf_edgelist, from_pandas_edgelist, @@ -44,7 +50,11 @@ symmetrize, symmetrize_df, symmetrize_ddf, -) + is_weighted, + is_directed, + is_multigraph, + is_bipartite, + is_multipartite) from cugraph.centrality import ( betweenness_centrality, @@ -77,23 +87,26 @@ sssp, shortest_path, filter_unreachable, - shortest_path_length + shortest_path_length, + traveling_salesperson, + concurrent_bfs, + multi_source_bfs, ) from cugraph.tree import minimum_spanning_tree, maximum_spanning_tree from cugraph.utilities import utils -from cugraph.bsp.traversal import bfs_df_pregel - from cugraph.proto.components import strong_connected_component from cugraph.proto.structure import find_bicliques -from cugraph.linear_assignment import hungarian +from cugraph.linear_assignment import hungarian, dense_hungarian from cugraph.layout import force_atlas2 from cugraph.raft import raft_include_test from cugraph.comms import comms +from cugraph.sampling import random_walks, rw_path + # Versioneer from ._version import get_versions diff --git a/python/cugraph/bsp/traversal/bfs_bsp.py b/python/cugraph/bsp/traversal/bfs_bsp.py deleted file mode 100644 index 28a71631443..00000000000 --- a/python/cugraph/bsp/traversal/bfs_bsp.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import cudf -from collections import OrderedDict - - -def bfs_df_pregel(_df, start, src_col='src', dst_col='dst', copy_data=True): - """ - This function executes an unwieghted Breadth-First-Search (BFS) traversal - to find the distances and predecessors from a specified starting vertex - - NOTE: Only reachable vertices are returned - NOTE: data is not sorted - - Parameters - ---------- - _df : cudf.dataframe - a dataframe containing the source and destination edge list - - start : same type as 'src' and 'dst' - The index of the graph vertex from which the traversal begins - - src : string - the source column name - - dst : string - the destination column name - - copy_data : Bool - whether we can manipulate the dataframe or if a copy should be made - - - Returns - ------- - df : cudf.DataFrame - df['vertex'][i] gives the vertex id of the i'th vertex - df['distance'][i] gives the path distance for the i'th vertex - from the starting vertex - df['predecessor'][i] gives for the i'th vertex the vertex it was - reached from in the traversal - - Examples - -------- - >>> data_df = - cudf.read_csv('datasets/karate.csv', delimiter=' ', header=None) - >>> df = cugraph.pregel_bfs(data_df, 1, '0', '1') - - """ - - # extract the src and dst into a dataframe that can be modified - if copy_data: - coo_data = _df[[src_col, dst_col]] - else: - coo_data = _df - - coo_data.rename(columns={src_col: 'src', dst_col: 'dst'}, inplace=True) - - # convert the "start" vertex into a series - frontier = cudf.Series(start).to_frame('dst') - - # create the answer DF - answer = cudf.DataFrame() - answer['vertex'] = start - answer['distance'] = 0 - answer['predecessor'] = -1 - - # init some variables - distance = 0 - done = False - - while not done: - - # --------------------------------- - # update the distance and add it to the dataframe - distance = distance + 1 - frontier['distance'] = distance - - # ----------------------------------- - # Removed all instances of the frontier vertices from 'dst' side - # we do not want to hop to a vertex that has already been seen - coo_data = coo_data.merge(frontier, on=['dst'], how='left') - coo_data = coo_data[coo_data.distance.isnull()] - coo_data.drop('distance', inplace=True) - - # now update column names for finding source vertices - frontier.rename(columns={'dst': 'src'}, inplace=True) - - # --------------------------------- - # merge the list of vertices and distances with the COO list - # there are two sets of results that we get from the "hop_df" merge - # (A) the set of edges that start with a vertice in the frontier set - # - this goes into the answer set - # - this also forms the next frontier set - # (B) the set of edges that did not start with a frontier vertex - # - this form the new set of coo_data - hop_df = coo_data.merge(frontier, on=['src'], how='left') - - # --------------------------------- - # (A) get the data where the 'src' was in the frontier list - # create a new dataframe of vertices to hop out from (the 'dst') - one_hop = hop_df.query("distance == @distance") - frontier = one_hop['dst'].to_frame('dst') - - # --------------------------------- - # (B) get all the edges that where not touched - coo_data = hop_df[hop_df.distance.isnull()] - coo_data.drop('distance', inplace=True) - - # --------------------------------- - # update the answer - one_hop.rename( - columns={'dst': 'vertex', 'src': 'predecessor'}, inplace=True) - - # remote duplicates. 
smallest vertex wins
-    aggsOut = OrderedDict()
-    aggsOut['predecessor'] = 'min'
-    aggsOut['distance'] = 'min'
-    _a = one_hop.groupby(['vertex'], as_index=False).agg(aggsOut)
-
-    answer = cudf.concat([answer, _a])
-
-    if len(coo_data) == 0:
-        done = True
-
-    if not done and len(frontier) == 0:
-        done = True
-
-    # all done, return the answer
-    return answer
diff --git a/python/cugraph/centrality/__init__.py b/python/cugraph/centrality/__init__.py
index da882a61850..f33df2fe61a 100644
--- a/python/cugraph/centrality/__init__.py
+++ b/python/cugraph/centrality/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,7 +12,7 @@
 # limitations under the License.

 from cugraph.centrality.katz_centrality import katz_centrality
-from cugraph.centrality.betweenness_centrality import betweenness_centrality
 from cugraph.centrality.betweenness_centrality import (
+    betweenness_centrality,
     edge_betweenness_centrality,
 )
diff --git a/python/cugraph/centrality/betweenness_centrality.pxd b/python/cugraph/centrality/betweenness_centrality.pxd
index 829d7be37d9..7abc9009cc8 100644
--- a/python/cugraph/centrality/betweenness_centrality.pxd
+++ b/python/cugraph/centrality/betweenness_centrality.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -19,7 +19,7 @@
 from cugraph.structure.graph_primtypes cimport *
 from libcpp cimport bool

-cdef extern from "algorithms.hpp" namespace "cugraph":
+cdef extern from "cugraph/algorithms.hpp" namespace "cugraph":

     cdef void betweenness_centrality[VT, ET, WT, result_t](
         const handle_t &handle,
diff --git a/python/cugraph/centrality/betweenness_centrality.py b/python/cugraph/centrality/betweenness_centrality.py
index 634cc2aa7a2..3b7cfe6b68f 100644
--- a/python/cugraph/centrality/betweenness_centrality.py
+++ b/python/cugraph/centrality/betweenness_centrality.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -36,8 +36,10 @@ def betweenness_centrality(
     Betweenness centrality is a measure of the number of shortest paths that
     pass through a vertex. A vertex with a high betweenness centrality score
     has more paths passing through it and is therefore believed to be more
-    important. Rather than doing an all-pair shortest path, a sample of k
-    starting vertices can be used.
+    important.
+
+    To improve performance, rather than doing an all-pair shortest path,
+    a sample of k starting vertices can be used.

     CuGraph does not currently support the 'endpoints' and 'weight' parameters
     as seen in the corresponding networkX call.
@@ -52,19 +54,18 @@ def betweenness_centrality(
     k : int or list or None, optional, default=None
         If k is not None, use k node samples to estimate betweenness. Higher
-        values give better approximation
-        If k is a list, use the content of the list for estimation: the list
-        should contain vertices identifiers.
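A minimal sketch of the sampled estimate described by the k parameter here; the `k=8` and `seed=42` values are illustrative, not defaults:

>>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
...                     dtype=['int32', 'int32', 'float32'], header=None)
>>> G = cugraph.Graph()
>>> G.from_cudf_edgelist(gdf, source='0', destination='1')
>>> bc_est = cugraph.betweenness_centrality(G, k=8, seed=42)  # 8 sampled sources
>>> bc_all = cugraph.betweenness_centrality(G)  # k=None: use all vertices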
-        If k is None (the default), all the vertices are used to estimate
-        betweenness.
-        Vertices obtained through sampling or defined as a list will be used as
-        sources for traversals inside the algorithm.
+        values give better approximation. If k is a list, use the content
+        of the list for estimation: the list should contain vertex
+        identifiers. If k is None (the default), all the vertices are used
+        to estimate betweenness. Vertices obtained through sampling or
+        defined as a list will be used as sources for traversals inside the
+        algorithm.

     normalized : bool, optional
         Default is True. If true, the betweenness values are normalized by
-        2 / ((n - 1) * (n - 2)) for Graphs (undirected), and
-        1 / ((n - 1) * (n - 2)) for DiGraphs (directed graphs)
+        __2 / ((n - 1) * (n - 2))__ for Graphs (undirected), and
+        __1 / ((n - 1) * (n - 2))__ for DiGraphs (directed graphs)
         where n is the number of nodes in G.
         Normalization will ensure that values are in [0, 1],
         this normalization scales for the highest possible value where one
@@ -145,15 +146,22 @@ def betweenness_centrality(


 def edge_betweenness_centrality(
-    G, k=None, normalized=True, weight=None, seed=None, result_dtype=np.float64
+    G,
+    k=None,
+    normalized=True,
+    weight=None,
+    seed=None,
+    result_dtype=np.float64
 ):
     """
     Compute the edge betweenness centrality for all edges of the graph G.
     Betweenness centrality is a measure of the number of shortest paths
     that pass over an edge. An edge with a high betweenness centrality
     score has more paths passing over it and is therefore believed to be
-    more important. Rather than doing an all-pair shortest path, a sample
-    of k starting vertices can be used.
+    more important.
+
+    To improve performance, rather than doing an all-pair shortest path,
+    a sample of k starting vertices can be used.

     CuGraph does not currently support the 'weight' parameter
     as seen in the corresponding networkX call.
@@ -168,7 +176,7 @@ def edge_betweenness_centrality(
     k : int or list or None, optional, default=None
         If k is not None, use k node samples to estimate betweenness. Higher
-        values give better approximation
+        values give better approximation.
         If k is a list, use the content of the list for estimation: the list
         should contain vertices identifiers.
         Vertices obtained through sampling or defined as a list will be used as
@@ -233,7 +241,6 @@ def edge_betweenness_centrality(
     >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
     >>> ebc = cugraph.edge_betweenness_centrality(G)
     """
-
     if weight is not None:
         raise NotImplementedError(
             "weighted implementation of betweenness "
@@ -254,8 +261,16 @@ def edge_betweenness_centrality(
         df = G.unrenumber(df, "dst")

     if type(G) is cugraph.Graph:
+        # select the lower triangle of the df based on src/dst vertex value
         lower_triangle = df['src'] >= df['dst']
-        df[["src", "dst"]][lower_triangle] = df[["dst", "src"]][lower_triangle]
+        # swap the src and dst vertices for the lower triangle only. Because
+        # this is a symmetrized graph, this operation results in a df with
+        # multiple src/dst entries.
+        df['src'][lower_triangle], df['dst'][lower_triangle] = \
+            df['dst'][lower_triangle], df['src'][lower_triangle]
+        # overwrite the df with the sum of the values for all alike src/dst
+        # vertex pairs, resulting in half the edges of the original df from the
+        # symmetrized graph.
df = df.groupby(by=["src", "dst"]).sum().reset_index() if isNx is True: diff --git a/python/cugraph/centrality/betweenness_centrality_wrapper.pyx b/python/cugraph/centrality/betweenness_centrality_wrapper.pyx index e3d6e04006f..e63b6996816 100644 --- a/python/cugraph/centrality/betweenness_centrality_wrapper.pyx +++ b/python/cugraph/centrality/betweenness_centrality_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -17,18 +17,12 @@ # cython: language_level = 3 from cugraph.centrality.betweenness_centrality cimport betweenness_centrality as c_betweenness_centrality -from cugraph.centrality.betweenness_centrality cimport handle_t -from cugraph.structure.graph import DiGraph +from cugraph.structure.graph_classes import DiGraph from cugraph.structure.graph_primtypes cimport * from libc.stdint cimport uintptr_t from libcpp cimport bool import cudf import numpy as np -import numpy.ctypeslib as ctypeslib - -import dask_cudf -import dask_cuda - import cugraph.comms.comms as Comms from cugraph.dask.common.mg_utils import get_client import dask.distributed diff --git a/python/cugraph/centrality/edge_betweenness_centrality_wrapper.pyx b/python/cugraph/centrality/edge_betweenness_centrality_wrapper.pyx index 3c14d590750..095d291c45e 100644 --- a/python/cugraph/centrality/edge_betweenness_centrality_wrapper.pyx +++ b/python/cugraph/centrality/edge_betweenness_centrality_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,14 +18,12 @@ from cugraph.centrality.betweenness_centrality cimport edge_betweenness_centrality as c_edge_betweenness_centrality from cugraph.structure import graph_primtypes_wrapper -from cugraph.structure.graph import DiGraph, Graph +from cugraph.structure.graph_classes import DiGraph, Graph from cugraph.structure.graph_primtypes cimport * from libc.stdint cimport uintptr_t from libcpp cimport bool import cudf import numpy as np -import numpy.ctypeslib as ctypeslib - from cugraph.dask.common.mg_utils import get_client import cugraph.comms.comms as Comms import dask.distributed diff --git a/python/cugraph/centrality/katz_centrality.pxd b/python/cugraph/centrality/katz_centrality.pxd index ebf94c78263..c48a90904da 100644 --- a/python/cugraph/centrality/katz_centrality.pxd +++ b/python/cugraph/centrality/katz_centrality.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
@@ -16,10 +16,10 @@
 # cython: embedsignature = True
 # cython: language_level = 3

-from cugraph.structure.graph_primtypes cimport *
+from cugraph.structure.graph_utilities cimport *
 from libcpp cimport bool

-cdef extern from "utilities/cython.hpp" namespace "cugraph::cython":
+cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython":

     cdef void call_katz_centrality[VT,WT](
         const handle_t &handle,
diff --git a/python/cugraph/centrality/katz_centrality.py b/python/cugraph/centrality/katz_centrality.py
index 3e2680a196f..a1e7c1b2349 100644
--- a/python/cugraph/centrality/katz_centrality.py
+++ b/python/cugraph/centrality/katz_centrality.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -39,14 +39,16 @@ def katz_centrality(
         Attenuation factor defaulted to None. If alpha is not specified then
         it is internally calculated as 1/(degree_max) where degree_max is the
         maximum out degree.
-        NOTE : The maximum acceptable value of alpha for convergence
-        alpha_max = 1/(lambda_max) where lambda_max is the largest eigenvalue
-        of the graph.
-        Since lambda_max is always lesser than or equal to degree_max for a
-        graph, alpha_max will always be greater than or equal to
-        (1/degree_max). Therefore, setting alpha to (1/degree_max) will
-        guarantee that it will never exceed alpha_max thus in turn fulfilling
-        the requirement for convergence.
+
+        NOTE
+            The maximum acceptable value of alpha for convergence is
+            alpha_max = 1/(lambda_max) where lambda_max is the largest
+            eigenvalue of the graph.
+            Since lambda_max is always less than or equal to degree_max for a
+            graph, alpha_max will always be greater than or equal to
+            (1/degree_max). Therefore, setting alpha to (1/degree_max) will
+            guarantee that it will never exceed alpha_max, in turn
+            fulfilling the requirement for convergence.
     beta : None
         A weight scalar - currently Not Supported
     max_iter : int
@@ -104,7 +106,11 @@ def katz_centrality(

     if nstart is not None:
         if G.renumbered is True:
-            nstart = G.add_internal_vertex_id(nstart, 'vertex', 'vertex')
+            if len(G.renumber_map.implementation.col_names) > 1:
+                cols = nstart.columns[:-1].to_list()
+            else:
+                cols = 'vertex'
+            nstart = G.add_internal_vertex_id(nstart, 'vertex', cols)

     df = katz_centrality_wrapper.katz_centrality(
         G, alpha, max_iter, tol, nstart, normalized
diff --git a/python/cugraph/centrality/katz_centrality_wrapper.pyx b/python/cugraph/centrality/katz_centrality_wrapper.pyx
index 088042395fd..038723ad9bf 100644
--- a/python/cugraph/centrality/katz_centrality_wrapper.pyx
+++ b/python/cugraph/centrality/katz_centrality_wrapper.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at @@ -17,13 +17,10 @@ # cython: language_level = 3 from cugraph.centrality.katz_centrality cimport call_katz_centrality -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * from cugraph.structure import graph_primtypes_wrapper -from libcpp cimport bool from libc.stdint cimport uintptr_t - import cudf -import rmm import numpy as np @@ -37,7 +34,7 @@ def get_output_df(input_graph, nstart): if len(nstart) != num_verts: raise ValueError('nstart must have initial guess for all vertices') - nstart['values'] = graph_primtypes_wrapper.datatype_cast([nstart['values']], [np.float64]) + nstart['values'] = graph_primtypes_wrapper.datatype_cast([nstart['values']], [np.float64])[0] df['katz_centrality'][nstart['vertex']] = nstart['values'] return df diff --git a/python/cugraph/comms/comms.pxd b/python/cugraph/comms/comms.pxd index 44f7ee77562..5bc24c0d639 100644 --- a/python/cugraph/comms/comms.pxd +++ b/python/cugraph/comms/comms.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -16,10 +16,10 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_primtypes cimport handle_t +from cugraph.raft.common.handle cimport * -cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": cdef void init_subcomms(handle_t &handle, size_t row_comm_size) diff --git a/python/cugraph/comms/comms.py b/python/cugraph/comms/comms.py index 925f4a1a060..85fc426f373 100644 --- a/python/cugraph/comms/comms.py +++ b/python/cugraph/comms/comms.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,7 +12,7 @@ # limitations under the License. from cugraph.raft.dask.common.comms import Comms as raftComms -from cugraph.raft.dask.common.comms import worker_state +from cugraph.raft.dask.common.comms import get_raft_comm_state from cugraph.raft.common.handle import Handle from cugraph.comms.comms_wrapper import init_subcomms as c_init_subcomms from dask.distributed import default_client @@ -196,12 +196,12 @@ def get_default_handle(): # Functions to be called from within workers def get_handle(sID): - sessionstate = worker_state(sID) + sessionstate = get_raft_comm_state(sID) return sessionstate['handle'] def get_worker_id(sID): - sessionstate = worker_state(sID) + sessionstate = get_raft_comm_state(sID) return sessionstate['wid'] @@ -216,5 +216,5 @@ def get_n_workers(sID=None): if sID is None: return read_utils.get_n_workers() else: - sessionstate = worker_state(sID) + sessionstate = get_raft_comm_state(sID) return sessionstate['nworkers'] diff --git a/python/cugraph/comms/comms_wrapper.pyx b/python/cugraph/comms/comms_wrapper.pyx index c1148b4c887..09fa3b1c5c7 100644 --- a/python/cugraph/comms/comms_wrapper.pyx +++ b/python/cugraph/comms/comms_wrapper.pyx @@ -1,5 +1,23 @@ +# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -from cugraph.structure.graph_primtypes cimport handle_t +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + + +from cugraph.raft.common.handle cimport * from cugraph.comms.comms cimport init_subcomms as c_init_subcomms diff --git a/python/cugraph/community/__init__.py b/python/cugraph/community/__init__.py index d3bb6472894..9cc92637e20 100644 --- a/python/cugraph/community/__init__.py +++ b/python/cugraph/community/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -25,3 +25,5 @@ from cugraph.community.triangle_count import triangles from cugraph.community.ktruss_subgraph import ktruss_subgraph from cugraph.community.ktruss_subgraph import k_truss +from cugraph.community.egonet import ego_graph +from cugraph.community.egonet import batched_ego_graphs diff --git a/python/cugraph/community/ecg.pxd b/python/cugraph/community/ecg.pxd index 9f1dc269b6f..4f13237eac7 100644 --- a/python/cugraph/community/ecg.pxd +++ b/python/cugraph/community/ecg.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -19,7 +19,7 @@ from cugraph.structure.graph_primtypes cimport * -cdef extern from "algorithms.hpp" namespace "cugraph": +cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": cdef void ecg[VT,ET,WT]( const handle_t &handle, diff --git a/python/cugraph/community/egonet.pxd b/python/cugraph/community/egonet.pxd new file mode 100644 index 00000000000..acf93330447 --- /dev/null +++ b/python/cugraph/community/egonet.pxd @@ -0,0 +1,23 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
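The egonet declarations introduced here are surfaced in Python as `ego_graph` and `batched_ego_graphs` (implemented in the files that follow). A hedged sketch of the batched form, which pairs a flat edge list with per-seed offsets; the seed list is illustrative:

>>> df, offsets = cugraph.batched_ego_graphs(G, seeds=[0, 5, 12], radius=1)
>>> off = offsets.values_host          # starting offset of each seed's edges
>>> first = df.iloc[off[0]:off[1]]     # edge list of the first seed's ego net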
+
+
+from cugraph.structure.graph_utilities cimport *
+
+cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython":
+    cdef unique_ptr[cy_multi_edgelists_t] call_egonet[vertex_t, weight_t](
+        const handle_t &handle,
+        const graph_container_t &g,
+        vertex_t* source_vertex,
+        vertex_t n_subgraphs,
+        vertex_t radius) except +
diff --git a/python/cugraph/community/egonet.py b/python/cugraph/community/egonet.py
new file mode 100644
index 00000000000..5ae025f1203
--- /dev/null
+++ b/python/cugraph/community/egonet.py
@@ -0,0 +1,166 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cugraph.community import egonet_wrapper
+import cudf
+from cugraph.utilities import (
+    ensure_cugraph_obj,
+    import_optional,
+)
+from cugraph.utilities import cugraph_to_nx
+
+# optional dependencies used for handling different input types
+nx = import_optional("networkx")
+
+
+def _convert_graph_to_output_type(G, input_type):
+    """
+    Given a cugraph.Graph, convert it to a new type appropriate for the
+    graph algos in this module, based on input_type.
+    """
+    if (nx is not None) and (input_type in [nx.Graph, nx.DiGraph]):
+        return cugraph_to_nx(G)
+
+    else:
+        return G
+
+
+def _convert_df_series_to_output_type(df, offsets, input_type):
+    """
+    Given a cudf.DataFrame df, convert it to a new type appropriate for the
+    graph algos in this module, based on input_type.
+    """
+    if (nx is not None) and (input_type in [nx.Graph, nx.DiGraph]):
+        return df.to_pandas(), offsets.values_host.tolist()
+
+    else:
+        return df, offsets
+
+
+def ego_graph(G, n, radius=1, center=True, undirected=False, distance=None):
+    """
+    Compute the induced subgraph of neighbors centered at node n,
+    within a given radius.
+
+    Parameters
+    ----------
+    G : cugraph.Graph, networkx.Graph, CuPy or SciPy sparse matrix
+        Graph or matrix object, which should contain the connectivity
+        information. Edge weights, if present, should be single or double
+        precision floating point values.
+    n : integer or cudf.DataFrame
+        A single node as integer or a cudf.DataFrame if nodes are
+        represented with multiple columns. If a cudf.DataFrame is provided,
+        only the first row is taken as the node input.
+    radius: integer, optional
+        Include all neighbors of distance<=radius from n.
+    center: bool, optional
+        Defaults to True. False is not supported.
+    undirected: bool, optional
+        Defaults to False. True is not supported.
+    distance: key, optional
+        Distances are counted in hops from n. Other cases are not supported.
+
+    Returns
+    -------
+    G_ego : cuGraph.Graph or networkx.Graph
+        A graph descriptor of the induced subgraph of neighbors of n within
+        the given radius.
+        The networkx graph will not have all attributes copied over
+
+    Examples
+    --------
+    >>> M = cudf.read_csv('datasets/karate.csv',
+                          delimiter = ' ',
+                          dtype=['int32', 'int32', 'float32'],
+                          header=None)
+    >>> G = cugraph.Graph()
+    >>> G.from_cudf_edgelist(M, source='0', destination='1')
+    >>> ego_graph = cugraph.ego_graph(G, 1, radius=2)
+
+    """
+
+    (G, input_type) = ensure_cugraph_obj(G, nx_weight_attr="weight")
+    result_graph = type(G)()
+
+    if G.renumbered is True:
+        if isinstance(n, cudf.DataFrame):
+            n = G.lookup_internal_vertex_id(n, n.columns)
+        else:
+            n = G.lookup_internal_vertex_id(cudf.Series([n]))
+
+    df, offsets = egonet_wrapper.egonet(G, n, radius)
+
+    if G.renumbered:
+        df, src_names = G.unrenumber(df, "src", get_column_names=True)
+        df, dst_names = G.unrenumber(df, "dst", get_column_names=True)
+
+    if G.edgelist.weights:
+        result_graph.from_cudf_edgelist(
+            df, source=src_names, destination=dst_names,
+            edge_attr="weight"
+        )
+    else:
+        result_graph.from_cudf_edgelist(df, source=src_names,
+                                        destination=dst_names)
+    return _convert_graph_to_output_type(result_graph, input_type)
+
+
+def batched_ego_graphs(
+    G, seeds, radius=1, center=True, undirected=False, distance=None
+):
+    """
+    Compute the induced subgraph of neighbors for each node in seeds
+    within a given radius.
+
+    Parameters
+    ----------
+    G : cugraph.Graph, networkx.Graph, CuPy or SciPy sparse matrix
+        Graph or matrix object, which should contain the connectivity
+        information. Edge weights, if present, should be single or double
+        precision floating point values.
+    seeds : cudf.Series or list or cudf.DataFrame
+        Specifies the seeds of the induced egonet subgraphs.
+    radius: integer, optional
+        Include all neighbors of distance<=radius from each seed.
+    center: bool, optional
+        Defaults to True. False is not supported.
+    undirected: bool, optional
+        Defaults to False. True is not supported.
+    distance: key, optional
+        Distances are counted in hops from each seed. Other cases are not
+        supported.
+
+    Returns
+    -------
+    ego_edge_lists : cudf.DataFrame or pandas.DataFrame
+        GPU data frame containing all induced source identifiers,
+        destination identifiers, and edge weights
+    seeds_offsets: cudf.Series
+        Series containing the starting offset in the returned edge list
+        for each seed.
+    """
+
+    (G, input_type) = ensure_cugraph_obj(G, nx_weight_attr="weight")
+
+    if G.renumbered is True:
+        if isinstance(seeds, cudf.DataFrame):
+            seeds = G.lookup_internal_vertex_id(seeds, seeds.columns)
+        else:
+            seeds = G.lookup_internal_vertex_id(cudf.Series(seeds))
+
+    df, offsets = egonet_wrapper.egonet(G, seeds, radius)
+
+    if G.renumbered:
+        df = G.unrenumber(df, "src", preserve_order=True)
+        df = G.unrenumber(df, "dst", preserve_order=True)
+
+    return _convert_df_series_to_output_type(df, offsets, input_type)
diff --git a/python/cugraph/community/egonet_wrapper.pyx b/python/cugraph/community/egonet_wrapper.pyx
new file mode 100644
index 00000000000..418fc50b712
--- /dev/null
+++ b/python/cugraph/community/egonet_wrapper.pyx
@@ -0,0 +1,124 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +from cugraph.community.egonet cimport call_egonet +from cugraph.structure.graph_utilities cimport * +from libcpp cimport bool +from libc.stdint cimport uintptr_t +from cugraph.structure import graph_primtypes_wrapper +import cudf +import numpy as np +from rmm._lib.device_buffer cimport DeviceBuffer +from cudf.core.buffer import Buffer + + +def egonet(input_graph, vertices, radius=1): + """ + Call egonet + """ + # FIXME: Offsets and indices are currently hardcoded to int, but this may + # not be acceptable in the future. + numberTypeMap = {np.dtype("int32") : numberTypeEnum.int32Type, + np.dtype("int64") : numberTypeEnum.int64Type, + np.dtype("float32") : numberTypeEnum.floatType, + np.dtype("double") : numberTypeEnum.doubleType} + + [src, dst] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']], [np.int32]) + vertex_t = src.dtype + edge_t = np.dtype("int32") + weights = None + if input_graph.edgelist.weights: + weights = input_graph.edgelist.edgelist_df['weights'] + + num_verts = input_graph.number_of_vertices() + num_edges = input_graph.number_of_edges(directed_edges=True) + num_local_edges = num_edges + + cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] + cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] + cdef uintptr_t c_edge_weights = NULL + if weights is not None: + c_edge_weights = weights.__cuda_array_interface__['data'][0] + weight_t = weights.dtype + is_weighted = True + else: + weight_t = np.dtype("float32") + is_weighted = False + + is_symmetric = not input_graph.is_directed() + + # Pointers for egonet + vertices = vertices.astype('int32') + cdef uintptr_t c_source_vertex_ptr = vertices.__cuda_array_interface__['data'][0] + n_subgraphs = vertices.size + n_streams = 1 + if n_subgraphs > 1 : + n_streams = min(n_subgraphs, 32) + cdef unique_ptr[handle_t] handle_ptr + handle_ptr.reset(new handle_t(n_streams)) + handle_ = handle_ptr.get(); + + cdef graph_container_t graph_container + populate_graph_container(graph_container, + handle_[0], + c_src_vertices, c_dst_vertices, c_edge_weights, + NULL, + NULL, + 0, + ((numberTypeMap[vertex_t])), + ((numberTypeMap[edge_t])), + ((numberTypeMap[weight_t])), + num_local_edges, + num_verts, + num_edges, + is_weighted, + is_symmetric, + False, False) + + if(weight_t==np.dtype("float32")): + el_struct_ptr = move(call_egonet[int, float](handle_[0], + graph_container, + c_source_vertex_ptr, + n_subgraphs, + radius)) + else: + el_struct_ptr = move(call_egonet[int, double](handle_[0], + graph_container, + c_source_vertex_ptr, + n_subgraphs, + radius)) + + el_struct = move(el_struct_ptr.get()[0]) + src = DeviceBuffer.c_from_unique_ptr(move(el_struct.src_indices)) + dst = DeviceBuffer.c_from_unique_ptr(move(el_struct.dst_indices)) + wgt = DeviceBuffer.c_from_unique_ptr(move(el_struct.edge_data)) + src = Buffer(src) + dst = Buffer(dst) + wgt = Buffer(wgt) + + src = cudf.Series(data=src, dtype=vertex_t) + dst = cudf.Series(data=dst, dtype=vertex_t) + + df = cudf.DataFrame() + df['src'] = src + df['dst'] = dst + if wgt.nbytes != 0: + wgt = cudf.Series(data=wgt, dtype=weight_t) + df['weight'] = wgt + + offsets = DeviceBuffer.c_from_unique_ptr(move(el_struct.subgraph_offsets)) + offsets = Buffer(offsets) + offsets = cudf.Series(data=offsets, dtype="int") + + return df, offsets + diff --git a/python/cugraph/community/ktruss_subgraph.pxd 
b/python/cugraph/community/ktruss_subgraph.pxd
index ab3a5189414..d993c31c375 100644
--- a/python/cugraph/community/ktruss_subgraph.pxd
+++ b/python/cugraph/community/ktruss_subgraph.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -19,7 +19,7 @@
 from cugraph.structure.graph_primtypes cimport *

-cdef extern from "algorithms.hpp" namespace "cugraph":
+cdef extern from "cugraph/algorithms.hpp" namespace "cugraph":

     cdef unique_ptr[GraphCOO[VT,ET,WT]] k_truss_subgraph[VT,ET,WT](
         const GraphCOOView[VT,ET,WT] &graph,
diff --git a/python/cugraph/community/ktruss_subgraph.py b/python/cugraph/community/ktruss_subgraph.py
index 8e4f1471955..afa7d66d31d 100644
--- a/python/cugraph/community/ktruss_subgraph.py
+++ b/python/cugraph/community/ktruss_subgraph.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,15 +12,36 @@
 # limitations under the License.

 from cugraph.community import ktruss_subgraph_wrapper
-from cugraph.structure.graph import Graph
+from cugraph.structure.graph_classes import Graph
 from cugraph.utilities import check_nx_graph
 from cugraph.utilities import cugraph_to_nx

+from numba import cuda
+
+
+# FIXME: special case for ktruss on CUDA 11.4: an 11.4 bug causes ktruss to
+# crash in that environment. Allow ktruss to import on non-11.4 systems, but
+# raise an exception if ktruss is directly imported on 11.4.
+def _ensure_compatible_cuda_version():
+    try:
+        cuda_version = cuda.runtime.get_version()
+    except cuda.cudadrv.runtime.CudaRuntimeAPIError:
+        cuda_version = "n/a"
+
+    unsupported_cuda_version = (11, 4)
+
+    if cuda_version == unsupported_cuda_version:
+        ver_string = ".".join([str(n) for n in unsupported_cuda_version])
+        raise NotImplementedError("k_truss is not currently supported in CUDA"
+                                  f" {ver_string} environments.")
+

 def k_truss(G, k):
     """
     Returns the K-Truss subgraph of a graph for a specific k.

+    NOTE: this function is currently not available on CUDA 11.4 systems.
+
     The k-truss of a graph is a subgraph where each edge is part of at least
     (k-2) triangles. K-trusses are used for finding tightly knit groups of
     vertices in a graph. A k-truss is a relaxation of a k-clique in the graph
@@ -44,6 +65,8 @@ def k_truss(G, k):
         The networkx graph will NOT have all attributes copied over
     """

+    _ensure_compatible_cuda_version()
+
     G, isNx = check_nx_graph(G)

     if isNx is True:
@@ -60,6 +83,8 @@ def ktruss_subgraph(G, k, use_weights=True):
     """
     Returns the K-Truss subgraph of a graph for a specific k.

+    NOTE: this function is currently not available on CUDA 11.4 systems.
+
     The k-truss of a graph is a subgraph where each edge is part of at least
     (k-2) triangles. K-trusses are used for finding tightly knit groups of
     vertices in a graph.
A k-truss is a relaxation of a k-clique in the graph @@ -117,6 +142,8 @@ def ktruss_subgraph(G, k, use_weights=True): >>> k_subgraph = cugraph.ktruss_subgraph(G, 3) """ + _ensure_compatible_cuda_version() + KTrussSubgraph = Graph() if type(G) is not Graph: raise Exception("input graph must be undirected") diff --git a/python/cugraph/community/ktruss_subgraph_wrapper.pyx b/python/cugraph/community/ktruss_subgraph_wrapper.pyx index 9f8138f4d57..d3b7a38ba41 100644 --- a/python/cugraph/community/ktruss_subgraph_wrapper.pyx +++ b/python/cugraph/community/ktruss_subgraph_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -19,12 +19,6 @@ from cugraph.community.ktruss_subgraph cimport * from cugraph.structure.graph_primtypes cimport * from cugraph.structure import graph_primtypes_wrapper -from libcpp cimport bool -from libc.stdint cimport uintptr_t -from libc.float cimport FLT_MAX_EXP - -import cudf -import rmm import numpy as np @@ -39,6 +33,10 @@ def ktruss_subgraph_double(input_graph, k, use_weights): def ktruss_subgraph(input_graph, k, use_weights): + [input_graph.edgelist.edgelist_df['src'], + input_graph.edgelist.edgelist_df['dst']] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['src'], + input_graph.edgelist.edgelist_df['dst']], + [np.int32]) if graph_primtypes_wrapper.weight_type(input_graph) == np.float64 and use_weights: return ktruss_subgraph_double(input_graph, k, use_weights) else: diff --git a/python/cugraph/community/leiden.pxd b/python/cugraph/community/leiden.pxd index 80e0e12f65a..871dc826c06 100644 --- a/python/cugraph/community/leiden.pxd +++ b/python/cugraph/community/leiden.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -21,7 +21,7 @@ from libcpp.utility cimport pair from cugraph.structure.graph_primtypes cimport * -cdef extern from "algorithms.hpp" namespace "cugraph": +cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": cdef pair[size_t, weight_t] leiden[vertex_t,edge_t,weight_t]( const handle_t &handle, diff --git a/python/cugraph/community/leiden.py b/python/cugraph/community/leiden.py index 8c1b79b8b63..641cf552192 100644 --- a/python/cugraph/community/leiden.py +++ b/python/cugraph/community/leiden.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 - 2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,7 +12,7 @@ # limitations under the License. 
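For orientation on the leiden module touched below: the C++ entry point declared above returns a pair whose second element is the final modularity, which the Python layer exposes alongside a partition DataFrame. A hedged sketch, assuming the same two-value return convention as `louvain`:

>>> parts, modularity_score = cugraph.leiden(G)
>>> parts, modularity_score = cugraph.louvain(G)  # same return convention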
from cugraph.community import leiden_wrapper -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph from cugraph.utilities import check_nx_graph from cugraph.utilities import df_score_to_dictionary diff --git a/python/cugraph/community/leiden_wrapper.pyx b/python/cugraph/community/leiden_wrapper.pyx index 70fcfcf701b..1b41134c625 100644 --- a/python/cugraph/community/leiden_wrapper.pyx +++ b/python/cugraph/community/leiden_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -22,7 +22,6 @@ from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t import cudf -import rmm import numpy as np diff --git a/python/cugraph/community/louvain.pxd b/python/cugraph/community/louvain.pxd index eca15ba3d20..08625047285 100644 --- a/python/cugraph/community/louvain.pxd +++ b/python/cugraph/community/louvain.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,10 +18,10 @@ from libcpp.utility cimport pair -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * -cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": cdef pair[size_t, weight_t] call_louvain[weight_t]( const handle_t &handle, diff --git a/python/cugraph/community/louvain.py b/python/cugraph/community/louvain.py index d4d56a1100c..a761e060038 100644 --- a/python/cugraph/community/louvain.py +++ b/python/cugraph/community/louvain.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,7 +12,7 @@ # limitations under the License. from cugraph.community import louvain_wrapper -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph from cugraph.utilities import check_nx_graph from cugraph.utilities import df_score_to_dictionary diff --git a/python/cugraph/community/louvain_wrapper.pyx b/python/cugraph/community/louvain_wrapper.pyx index 6b218a0b962..c7ce4e8db66 100644 --- a/python/cugraph/community/louvain_wrapper.pyx +++ b/python/cugraph/community/louvain_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -17,12 +17,11 @@ # cython: language_level = 3 from cugraph.community cimport louvain as c_louvain -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t import cudf -import rmm import numpy as np diff --git a/python/cugraph/community/spectral_clustering.pxd b/python/cugraph/community/spectral_clustering.pxd index 27ce6130b05..346eb50a157 100644 --- a/python/cugraph/community/spectral_clustering.pxd +++ b/python/cugraph/community/spectral_clustering.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -19,7 +19,7 @@ from cugraph.structure.graph_primtypes cimport * -cdef extern from "algorithms.hpp" namespace "cugraph::ext_raft": +cdef extern from "cugraph/algorithms.hpp" namespace "cugraph::ext_raft": cdef void balancedCutClustering[VT,ET,WT]( const GraphCSRView[VT,ET,WT] &graph, diff --git a/python/cugraph/community/spectral_clustering.py b/python/cugraph/community/spectral_clustering.py index b5f175e8237..06294af00c9 100644 --- a/python/cugraph/community/spectral_clustering.py +++ b/python/cugraph/community/spectral_clustering.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -32,23 +32,23 @@ def spectralBalancedCutClustering( Parameters ---------- G : cugraph.Graph or networkx.Graph - cuGraph graph descriptor + graph descriptor num_clusters : integer - Specifies the number of clusters to find + Specifies the number of clusters to find, must be greater than 1 num_eigen_vects : integer Specifies the number of eigenvectors to use. Must be lower or equal to - num_clusters. + num_clusters. Default is 2 evs_tolerance: float - Specifies the tolerance to use in the eigensolver + Specifies the tolerance to use in the eigensolver. Default is 0.00001 evs_max_iter: integer - Specifies the maximum number of iterations for the eigensolver + Specifies the maximum number of iterations for the eigensolver. Default is 100 kmean_tolerance: float - Specifies the tolerance to use in the k-means solver + Specifies the tolerance to use in the k-means solver. Default is 0.00001 kmean_max_iter: integer - Specifies the maximum number of iterations for the k-means solver + Specifies the maximum number of iterations for the k-means solver. Default is 100 Returns @@ -73,6 +73,8 @@ def spectralBalancedCutClustering( >>> df = cugraph.spectralBalancedCutClustering(G, 5) """ + # Error checking in C++ code + G, isNx = check_nx_graph(G) df = spectral_clustering_wrapper.spectralBalancedCutClustering( @@ -109,24 +111,24 @@ def spectralModularityMaximizationClustering( Parameters ---------- - G : cugraph.Graph + G : cugraph.Graph or networkx.Graph cuGraph graph descriptor. This graph should have edge weights. num_clusters : integer Specifies the number of clusters to find num_eigen_vects : integer Specifies the number of eigenvectors to use. Must be lower or equal to - num_clusters + num_clusters. 
Default is 2
     evs_tolerance: float
-        Specifies the tolerance to use in the eigensolver
+        Specifies the tolerance to use in the eigensolver. Default is 0.00001
     evs_max_iter: integer
-        Specifies the maximum number of iterations for the eigensolver
+        Specifies the maximum number of iterations for the eigensolver. Default is 100
     kmean_tolerance: float
-        Specifies the tolerance to use in the k-means solver
+        Specifies the tolerance to use in the k-means solver. Default is 0.00001
     kmean_max_iter: integer
-        Specifies the maximum number of iterations for the k-means solver
+        Specifies the maximum number of iterations for the k-means solver. Default is 100

     Returns
@@ -148,6 +150,8 @@ def spectralModularityMaximizationClustering(
     >>> df = cugraph.spectralModularityMaximizationClustering(G, 5)
     """

+    # Error checking in C++ code
+
     G, isNx = check_nx_graph(G)

     df = spectral_clustering_wrapper.spectralModularityMaximizationClustering(
@@ -173,18 +177,21 @@ def analyzeClustering_modularity(G, n_clusters, clustering,
                                  vertex_col_name='vertex',
                                  cluster_col_name='cluster'):
     """
-    Compute the modularity score for a partitioning/clustering
+    Compute the modularity score for a given partitioning/clustering.
+    The assumption is that "clustering" is the result of a call
+    to a clustering algorithm and contains columns named
+    "vertex" and "cluster".

     Parameters
     ----------
-    G : cugraph.Graph
-        cuGraph graph descriptor. This graph should have edge weights.
+    G : cugraph.Graph or networkx.Graph
+        graph descriptor. This graph should have edge weights.
     n_clusters : integer
         Specifies the number of clusters in the given clustering
     clustering : cudf.DataFrame
         The cluster assignment to analyze.
-    vertex_col_name : str
-        The name of the column in the clustering dataframe identifying
+    vertex_col_name : str or list of str
+        The name(s) of the column(s) in the clustering dataframe identifying
         the external vertex id
     cluster_col_name : str
         The name of the column in the clustering dataframe identifying
@@ -204,17 +211,26 @@ def analyzeClustering_modularity(G, n_clusters, clustering,
     >>> G = cugraph.Graph()
     >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2')
     >>> df = cugraph.spectralBalancedCutClustering(G, 5)
-    >>> score = cugraph.analyzeClustering_modularity(G, 5, df,
-    >>>                                              'vertex', 'cluster')
+    >>> score = cugraph.analyzeClustering_modularity(G, 5, df)
     """
+    if type(vertex_col_name) is list:
+        if not all(isinstance(name, str) for name in vertex_col_name):
+            raise Exception("vertex_col_name must be a list of strings")
+    elif type(vertex_col_name) is not str:
+        raise Exception("vertex_col_name must be a string")
+
+    if type(cluster_col_name) is not str:
+        raise Exception("cluster_col_name must be a string")
+
+    G, isNx = check_nx_graph(G)

     if G.renumbered:
         clustering = G.add_internal_vertex_id(clustering,
-                                              vertex_col_name,
+                                              'vertex', vertex_col_name,
                                               drop=True)
-    clustering = clustering.sort_values(vertex_col_name)
+    clustering = clustering.sort_values('vertex')

     score = spectral_clustering_wrapper.analyzeClustering_modularity(
         G, n_clusters, clustering[cluster_col_name]
@@ -228,6 +244,9 @@ def analyzeClustering_edge_cut(G, n_clusters, clustering,
                                cluster_col_name='cluster'):
     """
     Compute the edge cut score for a partitioning/clustering
+    The assumption is that "clustering" is the result of a call
+    to a clustering algorithm and contains columns named
+    "vertex" and "cluster".
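Putting the revised scoring helpers together, following the docstring examples above (the helpers assume the 'vertex'/'cluster' columns produced by the spectral clustering calls; a sketch, not part of this diff):

>>> df = cugraph.spectralBalancedCutClustering(G, 5)
>>> mod_score = cugraph.analyzeClustering_modularity(G, 5, df)
>>> cut_score = cugraph.analyzeClustering_edge_cut(G, 5, df)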
Parameters
     ----------
@@ -258,19 +277,26 @@ def analyzeClustering_edge_cut(G, n_clusters, clustering,
     >>> G = cugraph.Graph()
     >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr=None)
     >>> df = cugraph.spectralBalancedCutClustering(G, 5)
-    >>> score = cugraph.analyzeClustering_edge_cut(G, 5, df,
-    >>>                                            'vertex', 'cluster')
+    >>> score = cugraph.analyzeClustering_edge_cut(G, 5, df)
     """
+    if type(vertex_col_name) is list:
+        if not all(isinstance(name, str) for name in vertex_col_name):
+            raise Exception("vertex_col_name must be a list of strings")
+    elif type(vertex_col_name) is not str:
+        raise Exception("vertex_col_name must be a string")
+
+    if type(cluster_col_name) is not str:
+        raise Exception("cluster_col_name must be a string")

     G, isNx = check_nx_graph(G)

     if G.renumbered:
         clustering = G.add_internal_vertex_id(clustering,
-                                              vertex_col_name,
+                                              'vertex', vertex_col_name,
                                               drop=True)
-    clustering = clustering.sort_values(vertex_col_name).reset_index(drop=True)
+    clustering = clustering.sort_values('vertex').reset_index(drop=True)

     score = spectral_clustering_wrapper.analyzeClustering_edge_cut(
         G, n_clusters, clustering[cluster_col_name]
@@ -317,14 +343,22 @@ def analyzeClustering_ratio_cut(G, n_clusters, clustering,
     >>> score = cugraph.analyzeClustering_ratio_cut(G, 5, df,
     >>>                                             'vertex', 'cluster')
     """
+    if type(vertex_col_name) is list:
+        if not all(isinstance(name, str) for name in vertex_col_name):
+            raise Exception("vertex_col_name must be a list of strings")
+    elif type(vertex_col_name) is not str:
+        raise Exception("vertex_col_name must be a string")
+
+    if type(cluster_col_name) is not str:
+        raise Exception("cluster_col_name must be a string")

     if G.renumbered:
         clustering = G.add_internal_vertex_id(clustering,
-                                              vertex_col_name,
+                                              'vertex', vertex_col_name,
                                               drop=True)
-    clustering = clustering.sort_values(vertex_col_name)
+    clustering = clustering.sort_values('vertex')

     score = spectral_clustering_wrapper.analyzeClustering_ratio_cut(
         G, n_clusters, clustering[cluster_col_name]
diff --git a/python/cugraph/community/spectral_clustering_wrapper.pyx b/python/cugraph/community/spectral_clustering_wrapper.pyx
index 0593d987c0d..7934a386bb7 100644
--- a/python/cugraph/community/spectral_clustering_wrapper.pyx
+++ b/python/cugraph/community/spectral_clustering_wrapper.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -23,12 +23,9 @@ from cugraph.community.spectral_clustering cimport analyzeClustering_edge_cut as
 from cugraph.community.spectral_clustering cimport analyzeClustering_ratio_cut as c_analyze_clustering_ratio_cut
 from cugraph.structure.graph_primtypes cimport *
 from cugraph.structure import graph_primtypes_wrapper
-from libcpp cimport bool
 from libc.stdint cimport uintptr_t
-
 import cugraph
 import cudf
-import rmm
 import numpy as np
diff --git a/python/cugraph/community/subgraph_extraction.pxd b/python/cugraph/community/subgraph_extraction.pxd
index 97a71056006..583e220327d 100644
--- a/python/cugraph/community/subgraph_extraction.pxd
+++ b/python/cugraph/community/subgraph_extraction.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at @@ -20,7 +20,7 @@ from cugraph.structure.graph_primtypes cimport * from libcpp.memory cimport unique_ptr -cdef extern from "algorithms.hpp" namespace "cugraph::subgraph": +cdef extern from "cugraph/algorithms.hpp" namespace "cugraph::subgraph": cdef unique_ptr[GraphCOO[VT,ET,WT]] extract_subgraph_vertex[VT,ET,WT]( const GraphCOOView[VT,ET,WT] &graph, diff --git a/python/cugraph/community/subgraph_extraction.py b/python/cugraph/community/subgraph_extraction.py index 8c702c2f58f..2df6e037d71 100644 --- a/python/cugraph/community/subgraph_extraction.py +++ b/python/cugraph/community/subgraph_extraction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,8 +12,8 @@ # limitations under the License. from cugraph.community import subgraph_extraction_wrapper -from cugraph.structure.graph import null_check from cugraph.utilities import check_nx_graph +import cudf from cugraph.utilities import cugraph_to_nx @@ -28,8 +28,9 @@ def subgraph(G, vertices): ---------- G : cugraph.Graph cuGraph graph descriptor - vertices : cudf.Series - Specifies the vertices of the induced subgraph + vertices : cudf.Series or cudf.DataFrame + Specifies the vertices of the induced subgraph. For multi-column + vertices, vertices should be provided as a cudf.DataFrame Returns ------- @@ -52,27 +53,30 @@ def subgraph(G, vertices): >>> Sg = cugraph.subgraph(G, sverts) """ - null_check(vertices) - G, isNx = check_nx_graph(G) if G.renumbered: - vertices = G.lookup_internal_vertex_id(vertices) + if isinstance(vertices, cudf.DataFrame): + vertices = G.lookup_internal_vertex_id(vertices, vertices.columns) + else: + vertices = G.lookup_internal_vertex_id(vertices) result_graph = type(G)() df = subgraph_extraction_wrapper.subgraph(G, vertices) if G.renumbered: - df = G.unrenumber(df, "src") - df = G.unrenumber(df, "dst") + df, src_names = G.unrenumber(df, "src", get_column_names=True) + df, dst_names = G.unrenumber(df, "dst", get_column_names=True) if G.edgelist.weights: result_graph.from_cudf_edgelist( - df, source="src", destination="dst", edge_attr="weight" + df, source=src_names, destination=dst_names, + edge_attr="weight" ) else: - result_graph.from_cudf_edgelist(df, source="src", destination="dst") + result_graph.from_cudf_edgelist(df, source=src_names, + destination=dst_names) if isNx is True: result_graph = cugraph_to_nx(result_graph) diff --git a/python/cugraph/community/subgraph_extraction_wrapper.pyx b/python/cugraph/community/subgraph_extraction_wrapper.pyx index 5dbb6ce1e27..46dc5c07eaf 100644 --- a/python/cugraph/community/subgraph_extraction_wrapper.pyx +++ b/python/cugraph/community/subgraph_extraction_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -20,9 +20,7 @@ from cugraph.community.subgraph_extraction cimport extract_subgraph_vertex as c_ from cugraph.structure.graph_primtypes cimport * from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t - import cudf -import rmm import numpy as np @@ -45,6 +43,10 @@ def subgraph(input_graph, vertices): if weights.dtype == np.float64: use_float = False + num_verts = input_graph.number_of_vertices() + num_edges = len(src) + num_input_vertices = len(vertices) + cdef GraphCOOView[int,int,float] in_graph_float cdef GraphCOOView[int,int,double] in_graph_double cdef unique_ptr[GraphCOO[int,int,float]] out_graph_float @@ -57,12 +59,9 @@ def subgraph(input_graph, vertices): if weights is not None: c_weights = weights.__cuda_array_interface__['data'][0] + [vertices] = graph_primtypes_wrapper.datatype_cast([vertices], [np.int32]) cdef uintptr_t c_vertices = vertices.__cuda_array_interface__['data'][0] - num_verts = input_graph.number_of_vertices() - num_edges = len(src) - num_input_vertices = len(vertices) - if use_float: in_graph_float = GraphCOOView[int,int,float](c_src, c_dst, c_weights, num_verts, num_edges); df = coo_to_df(move(c_extract_subgraph_vertex(in_graph_float, c_vertices, num_input_vertices))); diff --git a/python/cugraph/community/triangle_count.pxd b/python/cugraph/community/triangle_count.pxd index 70795a3f43a..55e8114ccbf 100644 --- a/python/cugraph/community/triangle_count.pxd +++ b/python/cugraph/community/triangle_count.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -20,7 +20,7 @@ from cugraph.structure.graph_primtypes cimport * from libc.stdint cimport uint64_t -cdef extern from "algorithms.hpp" namespace "cugraph::triangle": +cdef extern from "cugraph/algorithms.hpp" namespace "cugraph::triangle": cdef uint64_t triangle_count[VT,ET,WT]( const GraphCSRView[VT,ET,WT] &graph) except + diff --git a/python/cugraph/community/triangle_count.py b/python/cugraph/community/triangle_count.py index ff4dc9a5c5f..d28424a513e 100644 --- a/python/cugraph/community/triangle_count.py +++ b/python/cugraph/community/triangle_count.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,7 +12,7 @@ # limitations under the License. from cugraph.community import triangle_count_wrapper -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph from cugraph.utilities import check_nx_graph diff --git a/python/cugraph/community/triangle_count_wrapper.pyx b/python/cugraph/community/triangle_count_wrapper.pyx index d7cabd4676f..f1e842f9de4 100644 --- a/python/cugraph/community/triangle_count_wrapper.pyx +++ b/python/cugraph/community/triangle_count_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -21,9 +21,7 @@ from cugraph.structure.graph_primtypes cimport * from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t import numpy as np - import cudf -import rmm def triangles(input_graph): diff --git a/python/cugraph/components/connectivity.pxd b/python/cugraph/components/connectivity.pxd index 94fa165969d..678836216b9 100644 --- a/python/cugraph/components/connectivity.pxd +++ b/python/cugraph/components/connectivity.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -17,9 +17,10 @@ # cython: language_level = 3 from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * -cdef extern from "algorithms.hpp" namespace "cugraph": +cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": ctypedef enum cugraph_cc_t: CUGRAPH_WEAK "cugraph::cugraph_cc_t::CUGRAPH_WEAK" @@ -30,3 +31,9 @@ cdef extern from "algorithms.hpp" namespace "cugraph": const GraphCSRView[VT,ET,WT] &graph, cugraph_cc_t connect_type, VT *labels) except + + +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": + cdef void call_wcc[vertex_t, weight_t]( + const handle_t &handle, + const graph_container_t &g, + vertex_t *identifiers) except + diff --git a/python/cugraph/components/connectivity.py b/python/cugraph/components/connectivity.py index 7c68afd7ced..94eea312fb9 100644 --- a/python/cugraph/components/connectivity.py +++ b/python/cugraph/components/connectivity.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -138,8 +138,10 @@ def weakly_connected_components(G, directed : bool, optional - NOTE: For non-Graph-type (eg. sparse matrix) values of G only. Raises - TypeError if used with a Graph object. + NOTE + For non-Graph-type (eg. sparse matrix) values of G only. + Raises TypeError if used with a Graph object. + If True (default), then convert the input matrix to a cugraph.DiGraph and only move from point i to point j along paths csgraph[i, j]. If False, then find the shortest path on an undirected graph: the @@ -154,8 +156,10 @@ def weakly_connected_components(G, return_labels : bool, optional - NOTE: For non-Graph-type (eg. sparse matrix) values of G only. Raises - TypeError if used with a Graph object. + NOTE + For non-Graph-type (eg. sparse matrix) values of G only. Raises + TypeError if used with a Graph object. + If True (default), then return the labels for each of the connected components. @@ -231,8 +235,10 @@ def strongly_connected_components(G, directed : bool, optional - NOTE: For non-Graph-type (eg. sparse matrix) values of G only. Raises - TypeError if used with a Graph object. + NOTE + For non-Graph-type (eg. sparse matrix) values of G only. + Raises TypeError if used with a Graph object. + If True (default), then convert the input matrix to a cugraph.DiGraph and only move from point i to point j along paths csgraph[i, j]. 
If False, then find the shortest path on an undirected graph: the @@ -247,8 +253,10 @@ def strongly_connected_components(G, return_labels : bool, optional - NOTE: For non-Graph-type (eg. sparse matrix) values of G only. Raises - TypeError if used with a Graph object. + NOTE + For non-Graph-type (eg. sparse matrix) values of G only. Raises + TypeError if used with a Graph object. + If True (default), then return the labels for each of the connected components. @@ -325,8 +333,10 @@ def connected_components(G, directed : bool, optional - NOTE: For non-Graph-type (eg. sparse matrix) values of G only. Raises - TypeError if used with a Graph object. + NOTE + For non-Graph-type (eg. sparse matrix) values of G only. Raises + TypeError if used with a Graph object. + If True (default), then convert the input matrix to a cugraph.DiGraph and only move from point i to point j along paths csgraph[i, j]. If False, then find the shortest path on an undirected graph: the @@ -340,8 +350,10 @@ def connected_components(G, return_labels : bool, optional - NOTE: For non-Graph-type (eg. sparse matrix) values of G only. Raises - TypeError if used with a Graph object. + NOTE + For non-Graph-type (eg. sparse matrix) values of G only. Raises + TypeError if used with a Graph object. + If True (default), then return the labels for each of the connected components. @@ -378,7 +390,7 @@ def connected_components(G, header=None) >>> G = cugraph.Graph() >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr=None) - >>> df = cugraph.strongly_connected_components(G) + >>> df = cugraph.connected_components(G, connection="weak") """ if connection == "weak": return weakly_connected_components(G, directed, diff --git a/python/cugraph/components/connectivity_wrapper.pyx b/python/cugraph/components/connectivity_wrapper.pyx index 8b678d16ff8..28227bd1c07 100644 --- a/python/cugraph/components/connectivity_wrapper.pyx +++ b/python/cugraph/components/connectivity_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,12 +18,12 @@ from cugraph.components.connectivity cimport * from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * from cugraph.structure import utils_wrapper from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t from cugraph.structure.symmetrize import symmetrize -from cugraph.structure.graph import Graph as type_Graph - +from cugraph.structure.graph_classes import Graph as type_Graph import cudf import numpy as np @@ -31,47 +31,60 @@ def weakly_connected_components(input_graph): """ Call connected_components """ - offsets = None - indices = None + cdef unique_ptr[handle_t] handle_ptr + handle_ptr.reset(new handle_t()) + handle_ = handle_ptr.get() + + numberTypeMap = {np.dtype("int32") : numberTypeEnum.int32Type, + np.dtype("int64") : numberTypeEnum.int64Type, + np.dtype("float32") : numberTypeEnum.floatType, + np.dtype("double") : numberTypeEnum.doubleType} + + [src, dst] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['src'], + input_graph.edgelist.edgelist_df['dst']], + [np.int32]) if type(input_graph) is not type_Graph: # - # Need to create a symmetrized CSR for this local - # computation, don't want to keep it. 
- # - [src, dst] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['src'], - input_graph.edgelist.edgelist_df['dst']], - [np.int32]) - src, dst = symmetrize(src, dst) - [offsets, indices] = utils_wrapper.coo2csr(src, dst)[0:2] - else: - if not input_graph.adjlist: - input_graph.view_adj_list() - - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, - input_graph.adjlist.indices], - [np.int32]) + # Need to create a symmetrized COO for this local + # computation + src, dst = symmetrize(src, dst) + weight_t = np.dtype("float32") + weights = None + num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) df = cudf.DataFrame() - df['vertex'] = cudf.Series(np.zeros(num_verts, dtype=np.int32)) + df['vertex'] = cudf.Series(np.arange(num_verts, dtype=np.int32)) df['labels'] = cudf.Series(np.zeros(num_verts, dtype=np.int32)) - cdef uintptr_t c_offsets = offsets.__cuda_array_interface__['data'][0] - cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0] - cdef uintptr_t c_identifier = df['vertex'].__cuda_array_interface__['data'][0]; + cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] + cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] + cdef uintptr_t c_edge_weights = NULL cdef uintptr_t c_labels_val = df['labels'].__cuda_array_interface__['data'][0]; - cdef GraphCSRView[int,int,float] g - - g = GraphCSRView[int,int,float](c_offsets, c_indices, NULL, num_verts, num_edges) - - cdef cugraph_cc_t connect_type=CUGRAPH_WEAK - connected_components(g, connect_type, c_labels_val) - - g.get_vertex_identifiers(c_identifier) + cdef graph_container_t graph_container + populate_graph_container(graph_container, + handle_[0], + c_src_vertices, c_dst_vertices, c_edge_weights, + NULL, + NULL, + 0, + ((numberTypeEnum.int32Type)), + ((numberTypeEnum.int32Type)), + ((numberTypeMap[weight_t])), + num_edges, + num_verts, num_edges, + False, + True, + False, + False) + + call_wcc[int, float](handle_ptr.get()[0], + graph_container, + c_labels_val) return df diff --git a/python/cugraph/cores/core_number.pxd b/python/cugraph/cores/core_number.pxd index cf28720a3e8..17dc1118a5e 100644 --- a/python/cugraph/cores/core_number.pxd +++ b/python/cugraph/cores/core_number.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,7 +18,7 @@ from cugraph.structure.graph_primtypes cimport * -cdef extern from "algorithms.hpp" namespace "cugraph": +cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": cdef void core_number[VT,ET,WT]( const GraphCSRView[VT,ET,WT] &graph, diff --git a/python/cugraph/cores/core_number_wrapper.pyx b/python/cugraph/cores/core_number_wrapper.pyx index 3df1df5f8e9..9fcc3b4746c 100644 --- a/python/cugraph/cores/core_number_wrapper.pyx +++ b/python/cugraph/cores/core_number_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
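Editor's note: the `weakly_connected_components` wrapper above now builds a `graph_container_t` directly from the symmetrized COO edge list and dispatches to `call_wcc`, instead of converting to CSR; the user-facing call is unchanged. A hedged usage sketch, mirroring the docstring examples in this diff (the `datasets/karate.csv` path is an assumption):

```python
# Hedged usage sketch for the refactored single-GPU WCC path above.
import cudf
import cugraph

M = cudf.read_csv("datasets/karate.csv", delimiter=" ",
                  dtype=["int32", "int32", "float32"], header=None)
G = cugraph.Graph()
G.from_cudf_edgelist(M, source="0", destination="1")
df = cugraph.weakly_connected_components(G)  # columns: 'vertex', 'labels'
```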
# You may obtain a copy of the License at @@ -20,9 +20,7 @@ cimport cugraph.cores.core_number as c_core from cugraph.structure.graph_primtypes cimport * from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t - import cudf -import rmm import numpy as np diff --git a/python/cugraph/cores/k_core.pxd b/python/cugraph/cores/k_core.pxd index 556dbc95ed9..1d22e7ac4d2 100644 --- a/python/cugraph/cores/k_core.pxd +++ b/python/cugraph/cores/k_core.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,7 +18,7 @@ from cugraph.structure.graph_primtypes cimport * -cdef extern from "algorithms.hpp" namespace "cugraph": +cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": cdef unique_ptr[GraphCOO[VT,ET,WT]] k_core[VT,ET,WT]( const GraphCOOView[VT,ET,WT] &in_graph, diff --git a/python/cugraph/cores/k_core.py b/python/cugraph/cores/k_core.py index ce67665764b..17a3baf9c4c 100644 --- a/python/cugraph/cores/k_core.py +++ b/python/cugraph/cores/k_core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -14,7 +14,7 @@ from cugraph.cores import k_core_wrapper, core_number_wrapper from cugraph.utilities import cugraph_to_nx from cugraph.utilities import check_nx_graph -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph def k_core(G, k=None, core_number=None): @@ -69,31 +69,38 @@ def k_core(G, k=None, core_number=None): if core_number is not None: if G.renumbered is True: - core_number = G.add_internal_vertex_id( - core_number, "vertex", "vertex", drop=True - ) + if len(G.renumber_map.implementation.col_names) > 1: + cols = core_number.columns[:-1].to_list() + else: + cols = 'vertex' + core_number = G.add_internal_vertex_id(core_number, 'vertex', + cols) + else: core_number = core_number_wrapper.core_number(G) core_number = core_number.rename( columns={"core_number": "values"}, copy=False ) - print(core_number) + if k is None: k = core_number["values"].max() k_core_df = k_core_wrapper.k_core(G, k, core_number) if G.renumbered: - k_core_df = G.unrenumber(k_core_df, "src") - k_core_df = G.unrenumber(k_core_df, "dst") + k_core_df, src_names = G.unrenumber(k_core_df, "src", + get_column_names=True) + k_core_df, dst_names = G.unrenumber(k_core_df, "dst", + get_column_names=True) if G.edgelist.weights: KCoreGraph.from_cudf_edgelist( - k_core_df, source="src", destination="dst", edge_attr="weight" + k_core_df, source=src_names, destination=dst_names, + edge_attr="weight" ) else: KCoreGraph.from_cudf_edgelist( - k_core_df, source="src", destination="dst" + k_core_df, source=src_names, destination=dst_names, ) if isNx is True: diff --git a/python/cugraph/cores/k_core_wrapper.pyx b/python/cugraph/cores/k_core_wrapper.pyx index 51ecec09dc5..28bb191f4f4 100644 --- a/python/cugraph/cores/k_core_wrapper.pyx +++ b/python/cugraph/cores/k_core_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. 
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -19,12 +19,7 @@ from cugraph.cores.k_core cimport k_core as c_k_core from cugraph.structure.graph_primtypes cimport * from cugraph.structure import graph_primtypes_wrapper -from libcpp cimport bool from libc.stdint cimport uintptr_t -from libc.float cimport FLT_MAX_EXP - -import cudf -import rmm import numpy as np @@ -54,6 +49,10 @@ def k_core(input_graph, k, core_number): """ Call k_core """ + [input_graph.edgelist.edgelist_df['src'], + input_graph.edgelist.edgelist_df['dst']] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['src'], + input_graph.edgelist.edgelist_df['dst']], + [np.int32]) if graph_primtypes_wrapper.weight_type(input_graph) == np.float64: return k_core_double(input_graph, k, core_number) else: diff --git a/python/cugraph/dask/__init__.py b/python/cugraph/dask/__init__.py index 830de45c50b..60aebaf19b0 100644 --- a/python/cugraph/dask/__init__.py +++ b/python/cugraph/dask/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -17,3 +17,4 @@ from .common.read_utils import get_chunksize from .community.louvain import louvain from .centrality.katz_centrality import katz_centrality +from .components.connectivity import weakly_connected_components diff --git a/python/cugraph/dask/centrality/katz_centrality.py b/python/cugraph/dask/centrality/katz_centrality.py index cf6ad95f974..61c1869f974 100644 --- a/python/cugraph/dask/centrality/katz_centrality.py +++ b/python/cugraph/dask/centrality/katz_centrality.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,8 +14,8 @@ # from dask.distributed import wait, default_client -from cugraph.dask.common.input_utils import get_distributed_data -from cugraph.structure.shuffle import shuffle +from cugraph.dask.common.input_utils import (get_distributed_data, + get_vertex_partition_offsets) from cugraph.dask.centrality import\ mg_katz_centrality_wrapper as mg_katz_centrality import cugraph.comms.comms as Comms @@ -27,6 +27,7 @@ def call_katz_centrality(sID, num_verts, num_edges, vertex_partition_offsets, + aggregate_segment_offsets, alpha, beta, max_iter, @@ -35,12 +36,16 @@ def call_katz_centrality(sID, normalized): wid = Comms.get_worker_id(sID) handle = Comms.get_handle(sID) + local_size = len(aggregate_segment_offsets) // Comms.get_n_workers(sID) + segment_offsets = \ + aggregate_segment_offsets[local_size * wid: local_size * (wid + 1)] return mg_katz_centrality.mg_katz_centrality(data[0], num_verts, num_edges, vertex_partition_offsets, wid, handle, + segment_offsets, alpha, beta, max_iter, @@ -68,14 +73,16 @@ def katz_centrality(input_graph, Attenuation factor defaulted to None. If alpha is not specified then it is internally calculated as 1/(degree_max) where degree_max is the maximum out degree. - NOTE : The maximum acceptable value of alpha for convergence - alpha_max = 1/(lambda_max) where lambda_max is the largest eigenvalue - of the graph. 
- Since lambda_max is always lesser than or equal to degree_max for a - graph, alpha_max will always be greater than or equal to - (1/degree_max). Therefore, setting alpha to (1/degree_max) will - guarantee that it will never exceed alpha_max thus in turn fulfilling - the requirement for convergence. + + NOTE + The maximum acceptable value of alpha for convergence + alpha_max = 1/(lambda_max) where lambda_max is the largest + eigenvalue of the graph. + Since lambda_max is always lesser than or equal to degree_max for a + graph, alpha_max will always be greater than or equal to + (1/degree_max). Therefore, setting alpha to (1/degree_max) will + guarantee that it will never exceed alpha_max thus in turn + fulfilling the requirement for convergence. beta : None A weight scalar - currently Not Supported max_iter : int @@ -94,6 +101,7 @@ def katz_centrality(input_graph, acceptable. nstart : dask_cudf.Dataframe GPU Dataframe containing the initial guess for katz centrality + nstart['vertex'] : dask_cudf.Series Contains the vertex identifiers nstart['values'] : dask_cudf.Series @@ -115,7 +123,8 @@ def katz_centrality(input_graph, Examples -------- >>> import cugraph.dask as dcg - >>> Comms.initialize(p2p=True) + >>> ... Init a DASK Cluster + >> see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html >>> chunksize = dcg.get_chunksize(input_data_path) >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize, delimiter=' ', @@ -125,7 +134,6 @@ def katz_centrality(input_graph, >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst', edge_attr='value') >>> pr = dcg.katz_centrality(dg) - >>> Comms.destroy() """ nstart = None @@ -133,11 +141,9 @@ def katz_centrality(input_graph, client = default_client() input_graph.compute_renumber_edge_list(transposed=True) - (ddf, - num_verts, - partition_row_size, - partition_col_size, - vertex_partition_offsets) = shuffle(input_graph, transposed=True) + ddf = input_graph.edgelist.edgelist_df + vertex_partition_offsets = get_vertex_partition_offsets(input_graph) + num_verts = vertex_partition_offsets.iloc[-1] num_edges = len(ddf) data = get_distributed_data(ddf) @@ -147,6 +153,7 @@ def katz_centrality(input_graph, num_verts, num_edges, vertex_partition_offsets, + input_graph.aggregate_segment_offsets, alpha, beta, max_iter, diff --git a/python/cugraph/dask/centrality/mg_katz_centrality.pxd b/python/cugraph/dask/centrality/mg_katz_centrality.pxd index 345457b1963..5e30530e92b 100644 --- a/python/cugraph/dask/centrality/mg_katz_centrality.pxd +++ b/python/cugraph/dask/centrality/mg_katz_centrality.pxd @@ -1,5 +1,5 @@ # -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,11 +14,11 @@ # limitations under the License. 
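Editor's note: each worker receives the flattened `aggregate_segment_offsets` and slices out its own window, as `call_katz_centrality` above does. A worked sketch with hypothetical values:

```python
# Worked sketch of the per-worker slicing of aggregate_segment_offsets
# used by call_katz_centrality above; the offsets below are hypothetical.
aggregate_segment_offsets = [0, 4, 9, 0, 5, 9]  # 2 workers x 3 entries each
n_workers = 2
wid = 1  # this worker's id
local_size = len(aggregate_segment_offsets) // n_workers
segment_offsets = aggregate_segment_offsets[local_size * wid:
                                            local_size * (wid + 1)]
assert segment_offsets == [0, 5, 9]  # worker 1's window
```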
# -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * from libcpp cimport bool -cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": cdef void call_katz_centrality[vertex_t, weight_t]( const handle_t &handle, diff --git a/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx b/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx index b8cab4e4286..6160d13507f 100644 --- a/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx +++ b/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ from cugraph.structure.utils_wrapper import * from cugraph.dask.centrality cimport mg_katz_centrality as c_katz_centrality import cudf -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * import cugraph.structure.graph_primtypes_wrapper as graph_primtypes_wrapper from libc.stdint cimport uintptr_t from cython.operator cimport dereference as deref @@ -30,6 +30,7 @@ def mg_katz_centrality(input_df, vertex_partition_offsets, rank, handle, + segment_offsets, alpha=None, beta=None, max_iter=100, @@ -48,12 +49,16 @@ def mg_katz_centrality(input_df, if num_global_edges > (2**31 - 1): edge_t = np.dtype("int64") else: - edge_t = np.dtype("int32") + edge_t = vertex_t if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype + is_weighted = True + raise NotImplementedError # FIXME: c_edge_weights is always set to NULL else: + weights = None weight_t = np.dtype("float32") + is_weighted = False if alpha is None: alpha = 0.1 @@ -67,27 +72,43 @@ def mg_katz_centrality(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] cdef uintptr_t c_edge_weights = NULL + if weights is not None: + c_edge_weights = weights.__cuda_array_interface__['data'][0] # FIXME: data is on device, move to host (to_pandas()), convert to np array and access pointer to pass to C vertex_partition_offsets_host = vertex_partition_offsets.values_host cdef uintptr_t c_vertex_partition_offsets = vertex_partition_offsets_host.__array_interface__['data'][0] + cdef vector[int] v_segment_offsets_32 + cdef vector[long] v_segment_offsets_64 + cdef uintptr_t c_segment_offsets + if (vertex_t == np.dtype("int32")): + v_segment_offsets_32 = segment_offsets + c_segment_offsets = v_segment_offsets_32.data() + else: + v_segment_offsets_64 = segment_offsets + c_segment_offsets = v_segment_offsets_64.data() + cdef graph_container_t graph_container + populate_graph_container(graph_container, handle_[0], c_src_vertices, c_dst_vertices, c_edge_weights, c_vertex_partition_offsets, + c_segment_offsets, + len(segment_offsets) - 1, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, - True, + is_weighted, + False, True, True) df = cudf.DataFrame() @@ -97,11 +118,18 @@ def mg_katz_centrality(input_df, cdef 
uintptr_t c_identifier = df['vertex'].__cuda_array_interface__['data'][0] cdef uintptr_t c_katz_centralities = df['katz_centrality'].__cuda_array_interface__['data'][0] - if (df['katz_centrality'].dtype == np.float32): - c_katz_centrality.call_katz_centrality[int, float](handle_[0], graph_container, c_identifier, c_katz_centralities, - alpha, beta, tol, max_iter, 0, normalize) + if vertex_t == np.int32: + if (df['katz_centrality'].dtype == np.float32): + c_katz_centrality.call_katz_centrality[int, float](handle_[0], graph_container, c_identifier, c_katz_centralities, + alpha, beta, tol, max_iter, 0, normalize) + else: + c_katz_centrality.call_katz_centrality[int, double](handle_[0], graph_container, c_identifier, c_katz_centralities, + alpha, beta, tol, max_iter, 0, normalize) else: - c_katz_centrality.call_katz_centrality[int, double](handle_[0], graph_container, c_identifier, c_katz_centralities, - alpha, beta, tol, max_iter, 0, normalize) - + if (df['katz_centrality'].dtype == np.float32): + c_katz_centrality.call_katz_centrality[long, float](handle_[0], graph_container, c_identifier, c_katz_centralities, + alpha, beta, tol, max_iter, 0, normalize) + else: + c_katz_centrality.call_katz_centrality[long, double](handle_[0], graph_container, c_identifier, c_katz_centralities, + alpha, beta, tol, max_iter, 0, normalize) return df diff --git a/python/cugraph/dask/common/input_utils.py b/python/cugraph/dask/common/input_utils.py index 0140c9f06f9..9d1c28b6c4e 100644 --- a/python/cugraph/dask/common/input_utils.py +++ b/python/cugraph/dask/common/input_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,8 +21,7 @@ import cugraph.comms.comms as Comms from cugraph.raft.dask.common.utils import get_client -from cugraph.dask.common.part_utils import (_extract_partitions, - load_balance_func) +from cugraph.dask.common.part_utils import _extract_partitions from dask.distributed import default_client from toolz import first from functools import reduce @@ -67,7 +66,7 @@ def get_client(cls, client=None): """ Class methods for initalization """ @classmethod - def create(cls, data, client=None): + def create(cls, data, client=None, batch_enabled=False): """ Creates a distributed data handler instance with the given distributed data set(s). 
@@ -91,7 +90,8 @@ def create(cls, data, client=None): else: raise Exception("Graph data must be dask-cudf dataframe") - gpu_futures = client.sync(_extract_partitions, data, client) + gpu_futures = client.sync( + _extract_partitions, data, client, batch_enabled=batch_enabled) workers = tuple(OrderedDict.fromkeys(map(lambda x: x[0], gpu_futures))) return DistributedDataHandler(gpu_futures=gpu_futures, workers=workers, datatype=datatype, multiple=multiple, @@ -174,6 +174,14 @@ def calculate_local_data(self, comms, by): self.max_vertex_id = max_vid +def _get_local_data(df, by): + df = df[0] + num_local_edges = len(df) + local_by_max = df[by].iloc[-1] + local_max = df[['src', 'dst']].max().max() + return num_local_edges, local_by_max, local_max + + """ Internal methods, API subject to change """ @@ -184,11 +192,14 @@ def _workers_to_parts(futures): :param futures: list of (worker, part) tuples :return: """ - w_to_p_map = OrderedDict() + w_to_p_map = OrderedDict.fromkeys(Comms.get_workers()) for w, p in futures: - if w not in w_to_p_map: + if w_to_p_map[w] is None: w_to_p_map[w] = [] w_to_p_map[w].append(p) + keys_to_delete = [w for (w, p) in w_to_p_map.items() if p is None] + for k in keys_to_delete: + del w_to_p_map[k] return w_to_p_map @@ -198,30 +209,9 @@ def get_obj(x): return x[0] if multiple else x return total, reduce(lambda a, b: a + b, total) -def _get_local_data(df, by): - df = df[0] - num_local_edges = len(df) - local_by_max = df[by].iloc[-1] - local_max = df[['src', 'dst']].max().max() - return num_local_edges, local_by_max, local_max - - -def get_local_data(input_graph, by, load_balance=True): - input_graph.compute_renumber_edge_list(transposed=(by == 'dst')) - _ddf = input_graph.edgelist.edgelist_df - ddf = _ddf.sort_values(by=by, ignore_index=True) - - if load_balance: - ddf = load_balance_func(ddf, by=by) - - comms = Comms.get_comms() - data = DistributedDataHandler.create(data=ddf) - data.calculate_local_data(comms, by) - return data - - -def get_mg_batch_data(dask_cudf_data): - data = DistributedDataHandler.create(data=dask_cudf_data) +def get_mg_batch_data(dask_cudf_data, batch_enabled=False): + data = DistributedDataHandler.create( + data=dask_cudf_data, batch_enabled=batch_enabled) return data @@ -232,3 +222,15 @@ def get_distributed_data(input_ddf): if data.worker_info is None and comms is not None: data.calculate_worker_and_rank_info(comms) return data + + +def get_vertex_partition_offsets(input_graph): + import cudf + renumber_vertex_count = input_graph.renumber_map.implementation.ddf.\ + map_partitions(len).compute() + renumber_vertex_cumsum = renumber_vertex_count.cumsum() + vertex_dtype = input_graph.edgelist.edgelist_df['src'].dtype + vertex_partition_offsets = cudf.Series([0], dtype=vertex_dtype) + vertex_partition_offsets = vertex_partition_offsets.append(cudf.Series( + renumber_vertex_cumsum, dtype=vertex_dtype)) + return vertex_partition_offsets diff --git a/python/cugraph/dask/common/mg_utils.py b/python/cugraph/dask/common/mg_utils.py index 7556afb122a..1651a9e800c 100644 --- a/python/cugraph/dask/common/mg_utils.py +++ b/python/cugraph/dask/common/mg_utils.py @@ -10,9 +10,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
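Editor's note: `get_vertex_partition_offsets` above replaces the shuffle-based bookkeeping. It prefixes a zero onto the cumulative sum of per-partition renumbered vertex counts, so the last entry equals the global vertex count (which is why callers use `vertex_partition_offsets.iloc[-1]` as `num_verts`). A sketch with made-up partition sizes:

```python
# Sketch of the offsets get_vertex_partition_offsets computes; the
# per-partition counts here are made up for illustration. Series.append
# is used to match the code above (newer cudf would use cudf.concat).
import cudf

renumber_vertex_count = cudf.Series([4, 3, 5])  # vertices per partition
offsets = cudf.Series([0], dtype="int32").append(
    cudf.Series(renumber_vertex_count.cumsum(), dtype="int32"))
print(offsets.values_host)  # [ 0  4  7 12]; offsets.iloc[-1] == num_verts
```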
-from cugraph.raft.dask.common.utils import default_client + +import os + import numba.cuda +from dask_cuda import LocalCUDACluster +from dask.distributed import Client + +from cugraph.raft.dask.common.utils import default_client +# FIXME: cugraph/__init__.py also imports the comms module, but +# depending on the import environment, cugraph/comms/__init__.py +# may be imported instead. The following imports the comms.py +# module directly +from cugraph.comms import comms as Comms + # FIXME: We currently look for the default client from dask, as such is the # if there is a dask client running without any GPU we will still try @@ -41,3 +53,36 @@ def is_single_gpu(): return False else: return True + + +def get_visible_devices(): + _visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES") + if _visible_devices is None: + # FIXME: We assume that if the variable is unset there is only one GPU + visible_devices = ["0"] + else: + visible_devices = _visible_devices.strip().split(",") + return visible_devices + + +def setup_local_dask_cluster(p2p=True): + """ + Performs steps to setup a Dask cluster using LocalCUDACluster and returns + the LocalCUDACluster and corresponding client instance. + """ + cluster = LocalCUDACluster() + client = Client(cluster) + client.wait_for_workers(len(get_visible_devices())) + Comms.initialize(p2p=p2p) + + return (cluster, client) + + +def teardown_local_dask_cluster(cluster, client): + """ + Performs steps to destroy a Dask cluster and a corresponding client + instance. + """ + Comms.destroy() + client.close() + cluster.close() diff --git a/python/cugraph/dask/common/part_utils.py b/python/cugraph/dask/common/part_utils.py index 505272fa563..2bff490d35c 100644 --- a/python/cugraph/dask/common/part_utils.py +++ b/python/cugraph/dask/common/part_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -78,12 +78,18 @@ def persist_distributed_data(dask_df, client): return parts -async def _extract_partitions(dask_obj, client=None): - +async def _extract_partitions(dask_obj, client=None, batch_enabled=False): client = default_client() if client is None else client + worker_list = Comms.get_workers() # dask.dataframe or dask.array if isinstance(dask_obj, (daskDataFrame, daskArray, daskSeries)): - parts = persist_distributed_data(dask_obj, client) + # parts = persist_distributed_data(dask_obj, client) + # FIXME: persist data to the same worker when batch_enabled=True + if batch_enabled: + persisted = client.persist(dask_obj, workers=worker_list[0]) + else: + persisted = client.persist(dask_obj) + parts = futures_of(persisted) # iterable of dask collections (need to colocate them) elif isinstance(dask_obj, collections.Sequence): # NOTE: We colocate (X, y) here by zipping delayed diff --git a/python/cugraph/dask/community/louvain.pxd b/python/cugraph/dask/community/louvain.pxd index b6b4cd23143..ab990330028 100644 --- a/python/cugraph/dask/community/louvain.pxd +++ b/python/cugraph/dask/community/louvain.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
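Editor's note: the new `mg_utils` helpers above bundle the LocalCUDACluster/Comms lifecycle that the revised docstrings no longer inline (the removed `Comms.initialize(p2p=True)` / `Comms.destroy()` lines). A hedged usage sketch pairing them:

```python
# Hedged sketch using the helpers defined above; any MG algorithm call
# (dcg.pagerank, dcg.bfs, ...) would go inside the try block.
from cugraph.dask.common.mg_utils import (setup_local_dask_cluster,
                                          teardown_local_dask_cluster)

cluster, client = setup_local_dask_cluster(p2p=True)
try:
    pass  # run multi-GPU cuGraph algorithms here
finally:
    teardown_local_dask_cluster(cluster, client)
```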
# You may obtain a copy of the License at @@ -17,10 +17,10 @@ # cython: language_level = 3 from libcpp.utility cimport pair -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * -cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": cdef pair[size_t, weight_t] call_louvain[weight_t]( const handle_t &handle, diff --git a/python/cugraph/dask/community/louvain.py b/python/cugraph/dask/community/louvain.py index 11ecb78375f..c4db00ab27a 100644 --- a/python/cugraph/dask/community/louvain.py +++ b/python/cugraph/dask/community/louvain.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -16,8 +16,8 @@ from dask.distributed import wait, default_client import cugraph.comms.comms as Comms -from cugraph.dask.common.input_utils import get_distributed_data -from cugraph.structure.shuffle import shuffle +from cugraph.dask.common.input_utils import (get_distributed_data, + get_vertex_partition_offsets) from cugraph.dask.community import louvain_wrapper as c_mg_louvain from cugraph.utilities.utils import is_cuda_version_less_than @@ -29,20 +29,21 @@ def call_louvain(sID, num_verts, num_edges, vertex_partition_offsets, - sorted_by_degree, + aggregate_segment_offsets, max_level, resolution): - wid = Comms.get_worker_id(sID) handle = Comms.get_handle(sID) - + local_size = len(aggregate_segment_offsets) // Comms.get_n_workers(sID) + segment_offsets = \ + aggregate_segment_offsets[local_size * wid: local_size * (wid + 1)] return c_mg_louvain.louvain(data[0], num_verts, num_edges, vertex_partition_offsets, wid, handle, - sorted_by_degree, + segment_offsets, max_level, resolution) @@ -55,7 +56,8 @@ def louvain(input_graph, max_iter=100, resolution=1.0): Examples -------- >>> import cugraph.dask as dcg - >>> Comms.initialize(p2p=True) + >>> ... Init a DASK Cluster + >> see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html >>> chunksize = dcg.get_chunksize(input_data_path) >>> ddf = dask_cudf.read_csv('datasets/karate.csv', chunksize=chunksize, delimiter=' ', @@ -83,14 +85,10 @@ def louvain(input_graph, max_iter=100, resolution=1.0): client = default_client() # Calling renumbering results in data that is sorted by degree input_graph.compute_renumber_edge_list(transposed=False) - sorted_by_degree = True - - (ddf, - num_verts, - partition_row_size, - partition_col_size, - vertex_partition_offsets) = shuffle(input_graph, transposed=False) + ddf = input_graph.edgelist.edgelist_df + vertex_partition_offsets = get_vertex_partition_offsets(input_graph) + num_verts = vertex_partition_offsets.iloc[-1] num_edges = len(ddf) data = get_distributed_data(ddf) @@ -100,7 +98,7 @@ def louvain(input_graph, max_iter=100, resolution=1.0): num_verts, num_edges, vertex_partition_offsets, - sorted_by_degree, + input_graph.aggregate_segment_offsets, max_iter, resolution, workers=[wf[0]]) diff --git a/python/cugraph/dask/community/louvain_wrapper.pyx b/python/cugraph/dask/community/louvain_wrapper.pyx index c2a12cf81f3..c3c3f1ad373 100644 --- a/python/cugraph/dask/community/louvain_wrapper.pyx +++ b/python/cugraph/dask/community/louvain_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -17,10 +17,10 @@ # cython: language_level = 3 from libc.stdint cimport uintptr_t +from libcpp.vector cimport vector from cugraph.dask.community cimport louvain as c_louvain -from cugraph.structure.graph_primtypes cimport * - +from cugraph.structure.graph_utilities cimport * import cudf import numpy as np @@ -38,7 +38,7 @@ def louvain(input_df, vertex_partition_offsets, rank, handle, - sorted_by_degree, + segment_offsets, max_level, resolution): """ @@ -57,18 +57,18 @@ def louvain(input_df, src = input_df['src'] dst = input_df['dst'] - num_partition_edges = len(src) + num_local_edges = len(src) if "value" in input_df.columns: weights = input_df['value'] else: - weights = cudf.Series(np.full(num_partition_edges, 1.0, dtype=np.float32)) + weights = cudf.Series(np.full(num_local_edges, 1.0, dtype=np.float32)) vertex_t = src.dtype if num_global_edges > (2**31 - 1): edge_t = np.dtype("int64") else: - edge_t = np.dtype("int32") + edge_t = vertex_t weight_t = weights.dtype # COO @@ -83,6 +83,16 @@ def louvain(input_df, num_local_verts = vertex_partition_offsets_host[rank+1] - vertex_partition_offsets_host[rank] + cdef vector[int] v_segment_offsets_32 + cdef vector[long] v_segment_offsets_64 + cdef uintptr_t c_segment_offsets + if (vertex_t == np.dtype("int32")): + v_segment_offsets_32 = segment_offsets + c_segment_offsets = v_segment_offsets_32.data() + else: + v_segment_offsets_64 = segment_offsets + c_segment_offsets = v_segment_offsets_64.data() + cdef graph_container_t graph_container # FIXME: The excessive casting for the enum arg is needed to make cython @@ -92,12 +102,15 @@ def louvain(input_df, handle_[0], c_src_vertices, c_dst_vertices, c_edge_weights, c_vertex_partition_offsets, + c_segment_offsets, + len(segment_offsets) - 1, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, - sorted_by_degree, + True, + False, False, True) # store_transposed, multi_gpu # Create the output dataframe, column lengths must be equal to the number of diff --git a/python/cugraph/dask/components/__init__.py b/python/cugraph/dask/components/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cugraph/dask/components/connectivity.py b/python/cugraph/dask/components/connectivity.py new file mode 100644 index 00000000000..81200e7383e --- /dev/null +++ b/python/cugraph/dask/components/connectivity.py @@ -0,0 +1,69 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
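Editor's note: when the edge list carries no 'value' column, the Louvain wrapper above substitutes unit weights, so modularity is computed over plain edge counts. Isolated for clarity:

```python
# Sketch of the unit-weight fallback in the Louvain wrapper above;
# num_local_edges is illustrative.
import cudf
import numpy as np

num_local_edges = 6
weights = cudf.Series(np.full(num_local_edges, 1.0, dtype=np.float32))
assert float(weights.sum()) == num_local_edges  # unweighted == edge counting
```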
+ +from dask.distributed import wait, default_client +from cugraph.dask.common.input_utils import (get_distributed_data, + get_vertex_partition_offsets) +from cugraph.dask.components import mg_connectivity_wrapper as mg_connectivity +import cugraph.comms.comms as Comms +import dask_cudf + + +def call_wcc(sID, + data, + num_verts, + num_edges, + vertex_partition_offsets, + aggregate_segment_offsets): + wid = Comms.get_worker_id(sID) + handle = Comms.get_handle(sID) + local_size = len(aggregate_segment_offsets) // Comms.get_n_workers(sID) + segment_offsets = \ + aggregate_segment_offsets[local_size * wid: local_size * (wid + 1)] + return mg_connectivity.mg_wcc(data[0], + num_verts, + num_edges, + vertex_partition_offsets, + wid, + handle, + segment_offsets) + + +def weakly_connected_components(input_graph): + + client = default_client() + + input_graph.compute_renumber_edge_list() + + ddf = input_graph.edgelist.edgelist_df + vertex_partition_offsets = get_vertex_partition_offsets(input_graph) + num_verts = vertex_partition_offsets.iloc[-1] + num_edges = len(ddf) + data = get_distributed_data(ddf) + + result = [client.submit(call_wcc, + Comms.get_session_id(), + wf[1], + num_verts, + num_edges, + vertex_partition_offsets, + input_graph.aggregate_segment_offsets, + workers=[wf[0]]) + for idx, wf in enumerate(data.worker_to_parts.items())] + wait(result) + ddf = dask_cudf.from_delayed(result) + + if input_graph.renumbered: + return input_graph.unrenumber(ddf, 'vertex') + + return ddf diff --git a/python/cugraph/dask/components/mg_connectivity.pxd b/python/cugraph/dask/components/mg_connectivity.pxd new file mode 100644 index 00000000000..04f04a9665e --- /dev/null +++ b/python/cugraph/dask/components/mg_connectivity.pxd @@ -0,0 +1,26 @@ +# +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from cugraph.structure.graph_utilities cimport * +from libcpp cimport bool + + +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": + + cdef void call_wcc[vertex_t, weight_t]( + const handle_t &handle, + const graph_container_t &g, + vertex_t * components) diff --git a/python/cugraph/dask/components/mg_connectivity_wrapper.pyx b/python/cugraph/dask/components/mg_connectivity_wrapper.pyx new file mode 100644 index 00000000000..b7bad1b6277 --- /dev/null +++ b/python/cugraph/dask/components/mg_connectivity_wrapper.pyx @@ -0,0 +1,111 @@ +# +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
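Editor's note: the new `weakly_connected_components` above is the multi-GPU entry point exported via `cugraph.dask` (see the `dask/__init__.py` change earlier in this diff). A hedged usage sketch; the dataset path and column names are assumptions:

```python
# Hedged usage sketch for the new MG WCC entry point defined above.
import cugraph
import cugraph.dask as dcg
import dask_cudf

chunksize = dcg.get_chunksize("datasets/karate.csv")
ddf = dask_cudf.read_csv("datasets/karate.csv", chunksize=chunksize,
                         delimiter=" ", names=["src", "dst", "value"],
                         dtype=["int32", "int32", "float32"])
dg = cugraph.Graph()
dg.from_dask_cudf_edgelist(ddf, source="src", destination="dst")
labels = dcg.weakly_connected_components(dg)  # dask_cudf: 'vertex', 'labels'
```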
+# + +from cugraph.structure.utils_wrapper import * +from cugraph.dask.components cimport mg_connectivity as c_connectivity +import cudf +from cugraph.structure.graph_utilities cimport * +import cugraph.structure.graph_primtypes_wrapper as graph_primtypes_wrapper +from libc.stdint cimport uintptr_t +from cython.operator cimport dereference as deref +import numpy as np + + +def mg_wcc(input_df, + num_global_verts, + num_global_edges, + vertex_partition_offsets, + rank, + handle, + segment_offsets): + + cdef size_t handle_size_t = handle.getHandle() + handle_ = handle_size_t + + src = input_df['src'] + dst = input_df['dst'] + vertex_t = src.dtype + if num_global_edges > (2**31 - 1): + edge_t = np.dtype("int64") + else: + edge_t = vertex_t + + weights = None + weight_t = np.dtype("float32") + is_weighted = False + + # FIXME: Offsets and indices are currently hardcoded to int, but this may + # not be acceptable in the future. + numberTypeMap = {np.dtype("int32") : numberTypeEnum.int32Type, + np.dtype("int64") : numberTypeEnum.int64Type, + np.dtype("float32") : numberTypeEnum.floatType, + np.dtype("double") : numberTypeEnum.doubleType} + + # FIXME: needs to be edge_t type not int + cdef int num_local_edges = len(src) + + cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] + cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] + cdef uintptr_t c_edge_weights = NULL + + # FIXME: data is on device, move to host (to_pandas()), convert to np array and access pointer to pass to C + vertex_partition_offsets_host = vertex_partition_offsets.values_host + cdef uintptr_t c_vertex_partition_offsets = vertex_partition_offsets_host.__array_interface__['data'][0] + + cdef vector[int] v_segment_offsets_32 + cdef vector[long] v_segment_offsets_64 + cdef uintptr_t c_segment_offsets + if (vertex_t == np.dtype("int32")): + v_segment_offsets_32 = segment_offsets + c_segment_offsets = v_segment_offsets_32.data() + else: + v_segment_offsets_64 = segment_offsets + c_segment_offsets = v_segment_offsets_64.data() + + cdef graph_container_t graph_container + + populate_graph_container(graph_container, + handle_[0], + c_src_vertices, c_dst_vertices, c_edge_weights, + c_vertex_partition_offsets, + c_segment_offsets, + len(segment_offsets) - 1, + ((numberTypeMap[vertex_t])), + ((numberTypeMap[edge_t])), + ((numberTypeMap[weight_t])), + num_local_edges, + num_global_verts, num_global_edges, + is_weighted, + True, + False, + True) + + df = cudf.DataFrame() + df['vertex'] = cudf.Series(np.arange(vertex_partition_offsets.iloc[rank], vertex_partition_offsets.iloc[rank+1]), dtype=vertex_t) + df['labels'] = cudf.Series(np.zeros(len(df['vertex']), dtype=vertex_t)) + + cdef uintptr_t c_labels_val = df['labels'].__cuda_array_interface__['data'][0]; + + if vertex_t == np.int32: + c_connectivity.call_wcc[int, float](handle_[0], + graph_container, + c_labels_val) + else: + c_connectivity.call_wcc[long, float](handle_[0], + graph_container, + c_labels_val) + + return df diff --git a/python/cugraph/dask/link_analysis/mg_pagerank.pxd b/python/cugraph/dask/link_analysis/mg_pagerank.pxd index 91104d9127c..4b47f43dd87 100644 --- a/python/cugraph/dask/link_analysis/mg_pagerank.pxd +++ b/python/cugraph/dask/link_analysis/mg_pagerank.pxd @@ -1,5 +1,5 @@ # -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
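Editor's note: `mg_wcc` above, like the other MG wrappers in this diff, now derives the edge dtype from the vertex dtype instead of hardcoding int32, widening only when the global edge count overflows a signed 32-bit integer. The rule, pulled out as a plain function:

```python
# The edge-dtype selection rule used by the MG wrappers above;
# vertex_t is the dtype of the 'src' column.
import numpy as np

def pick_edge_t(num_global_edges, vertex_t):
    if num_global_edges > (2**31 - 1):
        return np.dtype("int64")  # edge ids no longer fit in int32
    return vertex_t               # otherwise match the vertex dtype

assert pick_edge_t(10, np.dtype("int32")) == np.dtype("int32")
assert pick_edge_t(2**31, np.dtype("int32")) == np.dtype("int64")
```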
@@ -14,11 +14,11 @@ # limitations under the License. # -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * from libcpp cimport bool -cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": cdef void call_pagerank[vertex_t, weight_t]( const handle_t &handle, @@ -31,4 +31,4 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": double alpha, double tolerance, long long max_iter, - bool has_guess) except + \ No newline at end of file + bool has_guess) except + diff --git a/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx b/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx index 1cd80397b17..43ef3d7e31f 100644 --- a/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx +++ b/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ from cugraph.structure.utils_wrapper import * from cugraph.dask.link_analysis cimport mg_pagerank as c_pagerank import cudf -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * import cugraph.structure.graph_primtypes_wrapper as graph_primtypes_wrapper from libc.stdint cimport uintptr_t from cython.operator cimport dereference as deref @@ -30,6 +30,7 @@ def mg_pagerank(input_df, vertex_partition_offsets, rank, handle, + segment_offsets, alpha=0.85, max_iter=100, tol=1.0e-5, @@ -47,12 +48,16 @@ def mg_pagerank(input_df, if num_global_edges > (2**31 - 1): edge_t = np.dtype("int64") else: - edge_t = np.dtype("int32") + edge_t = vertex_t if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype + is_weighted = True + raise NotImplementedError # FIXME: c_edge_weights is always set to NULL else: + weights = None weight_t = np.dtype("float32") + is_weighted = False # FIXME: Offsets and indices are currently hardcoded to int, but this may # not be acceptable in the future. 
@@ -62,28 +67,43 @@ def mg_pagerank(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] cdef uintptr_t c_edge_weights = NULL + if weights is not None: + c_edge_weights = weights.__cuda_array_interface__['data'][0] # FIXME: data is on device, move to host (to_pandas()), convert to np array and access pointer to pass to C vertex_partition_offsets_host = vertex_partition_offsets.values_host cdef uintptr_t c_vertex_partition_offsets = vertex_partition_offsets_host.__array_interface__['data'][0] + cdef vector[int] v_segment_offsets_32 + cdef vector[long] v_segment_offsets_64 + cdef uintptr_t c_segment_offsets + if (vertex_t == np.dtype("int32")): + v_segment_offsets_32 = segment_offsets + c_segment_offsets = v_segment_offsets_32.data() + else: + v_segment_offsets_64 = segment_offsets + c_segment_offsets = v_segment_offsets_64.data() + cdef graph_container_t graph_container populate_graph_container(graph_container, handle_[0], c_src_vertices, c_dst_vertices, c_edge_weights, c_vertex_partition_offsets, + c_segment_offsets, + len(segment_offsets) - 1, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, - True, + is_weighted, + False, True, True) df = cudf.DataFrame() @@ -104,11 +124,19 @@ def mg_pagerank(input_df, c_pers_vtx = personalization['vertex'].__cuda_array_interface__['data'][0] c_pers_val = personalization['values'].__cuda_array_interface__['data'][0] - if (df['pagerank'].dtype == np.float32): - c_pagerank.call_pagerank[int, float](handle_[0], graph_container, c_identifier, c_pagerank_val, sz, c_pers_vtx, c_pers_val, - alpha, tol, max_iter, 0) + if vertex_t == np.int32: + if (df['pagerank'].dtype == np.float32): + c_pagerank.call_pagerank[int, float](handle_[0], graph_container, c_identifier, c_pagerank_val, sz, c_pers_vtx, c_pers_val, + alpha, tol, max_iter, 0) + else: + c_pagerank.call_pagerank[int, double](handle_[0], graph_container, c_identifier, c_pagerank_val, sz, c_pers_vtx, c_pers_val, + alpha, tol, max_iter, 0) else: - c_pagerank.call_pagerank[int, double](handle_[0], graph_container, c_identifier, c_pagerank_val, sz, c_pers_vtx, c_pers_val, - alpha, tol, max_iter, 0) - + if (df['pagerank'].dtype == np.float32): + c_pagerank.call_pagerank[long, float](handle_[0], graph_container, c_identifier, c_pagerank_val, sz, c_pers_vtx, c_pers_val, + alpha, tol, max_iter, 0) + else: + c_pagerank.call_pagerank[long, double](handle_[0], graph_container, c_identifier, c_pagerank_val, sz, c_pers_vtx, c_pers_val, + alpha, tol, max_iter, 0) + return df diff --git a/python/cugraph/dask/link_analysis/pagerank.py b/python/cugraph/dask/link_analysis/pagerank.py index 1e9d79e0aa6..87ef94d1600 100644 --- a/python/cugraph/dask/link_analysis/pagerank.py +++ b/python/cugraph/dask/link_analysis/pagerank.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -14,8 +14,8 @@ # from dask.distributed import wait, default_client -from cugraph.dask.common.input_utils import get_distributed_data -from cugraph.structure.shuffle import shuffle +from cugraph.dask.common.input_utils import (get_distributed_data, + get_vertex_partition_offsets) from cugraph.dask.link_analysis import mg_pagerank_wrapper as mg_pagerank import cugraph.comms.comms as Comms import dask_cudf @@ -26,6 +26,7 @@ def call_pagerank(sID, num_verts, num_edges, vertex_partition_offsets, + aggregate_segment_offsets, alpha, max_iter, tol, @@ -33,12 +34,16 @@ def call_pagerank(sID, nstart): wid = Comms.get_worker_id(sID) handle = Comms.get_handle(sID) + local_size = len(aggregate_segment_offsets) // Comms.get_n_workers(sID) + segment_offsets = \ + aggregate_segment_offsets[local_size * wid: local_size * (wid + 1)] return mg_pagerank.mg_pagerank(data[0], num_verts, num_edges, vertex_partition_offsets, wid, handle, + segment_offsets, alpha, max_iter, tol, @@ -73,6 +78,7 @@ def pagerank(input_graph, personalization : cudf.Dataframe GPU Dataframe containing the personalization information. Currently not supported. + personalization['vertex'] : cudf.Series Subset of vertices of graph for personalization personalization['values'] : cudf.Series @@ -91,6 +97,7 @@ def pagerank(input_graph, acceptable. nstart : not supported initial guess for pagerank + Returns ------- PageRank : dask_cudf.DataFrame @@ -105,7 +112,8 @@ def pagerank(input_graph, Examples -------- >>> import cugraph.dask as dcg - >>> Comms.initialize(p2p=True) + >>> ... Init a DASK Cluster + >> see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html >>> chunksize = dcg.get_chunksize(input_data_path) >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize, delimiter=' ', @@ -115,20 +123,18 @@ def pagerank(input_graph, >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst', edge_attr='value') >>> pr = dcg.pagerank(dg) - >>> Comms.destroy() """ - from cugraph.structure.graph import null_check + from cugraph.structure.graph_classes import null_check nstart = None client = default_client() input_graph.compute_renumber_edge_list(transposed=True) - (ddf, - num_verts, - partition_row_size, - partition_col_size, - vertex_partition_offsets) = shuffle(input_graph, transposed=True) + + ddf = input_graph.edgelist.edgelist_df + vertex_partition_offsets = get_vertex_partition_offsets(input_graph) + num_verts = vertex_partition_offsets.iloc[-1] num_edges = len(ddf) data = get_distributed_data(ddf) @@ -147,6 +153,7 @@ def pagerank(input_graph, num_verts, num_edges, vertex_partition_offsets, + input_graph.aggregate_segment_offsets, alpha, max_iter, tol, @@ -161,6 +168,7 @@ def pagerank(input_graph, num_verts, num_edges, vertex_partition_offsets, + input_graph.aggregate_segment_offsets, alpha, max_iter, tol, diff --git a/python/cugraph/dask/structure/replication.pyx b/python/cugraph/dask/structure/replication.pyx index 6d579e126bf..417300f806f 100644 --- a/python/cugraph/dask/structure/replication.pyx +++ b/python/cugraph/dask/structure/replication.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
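Editor's note: the pagerank docstring above documents `personalization` as not yet supported on the MG path; for reference, its expected shape is a two-column frame. A hedged sketch (dtypes are assumptions):

```python
# Hedged sketch of the personalization frame described in the pagerank
# docstring above; shown for shape only, since the MG path rejects it.
import cudf

personalization = cudf.DataFrame({
    "vertex": cudf.Series([0, 5], dtype="int32"),    # vertices to personalize
    "values": cudf.Series([0.5, 0.5], dtype="float32"),  # their weights
})
```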
# You may obtain a copy of the License at @@ -38,7 +38,7 @@ def replicate_cudf_dataframe(cudf_dataframe, client=None, comms=None): dask_cudf_df = dask_cudf.from_cudf(cudf_dataframe, npartitions=1) df_length = len(dask_cudf_df) - _df_data = get_mg_batch_data(dask_cudf_df) + _df_data = get_mg_batch_data(dask_cudf_df, batch_enabled=True) df_data = mg_utils.prepare_worker_to_parts(_df_data, client) workers_to_futures = {worker: client.submit(_replicate_cudf_dataframe, @@ -90,7 +90,7 @@ def replicate_cudf_series(cudf_series, client=None, comms=None): dask_cudf_series = dask_cudf.from_cudf(cudf_series, npartitions=1) series_length = len(dask_cudf_series) - _series_data = get_mg_batch_data(dask_cudf_series) + _series_data = get_mg_batch_data(dask_cudf_series, batch_enabled=True) series_data = mg_utils.prepare_worker_to_parts(_series_data) dtype = cudf_series.dtype diff --git a/python/cugraph/dask/traversal/bfs.py b/python/cugraph/dask/traversal/bfs.py index 7a2c50a3bc0..06cbf64782a 100644 --- a/python/cugraph/dask/traversal/bfs.py +++ b/python/cugraph/dask/traversal/bfs.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,8 +14,8 @@ # from dask.distributed import wait, default_client -from cugraph.dask.common.input_utils import get_distributed_data -from cugraph.structure.shuffle import shuffle +from cugraph.dask.common.input_utils import (get_distributed_data, + get_vertex_partition_offsets) from cugraph.dask.traversal import mg_bfs_wrapper as mg_bfs import cugraph.comms.comms as Comms import cudf @@ -27,23 +27,31 @@ def call_bfs(sID, num_verts, num_edges, vertex_partition_offsets, + aggregate_segment_offsets, start, + depth_limit, return_distances): wid = Comms.get_worker_id(sID) handle = Comms.get_handle(sID) + local_size = len(aggregate_segment_offsets) // Comms.get_n_workers(sID) + segment_offsets = \ + aggregate_segment_offsets[local_size * wid: local_size * (wid + 1)] return mg_bfs.mg_bfs(data[0], num_verts, num_edges, vertex_partition_offsets, wid, handle, + segment_offsets, start, + depth_limit, return_distances) def bfs(graph, start, - return_distances=False): + depth_limit=None, + return_distances=True): """ Find the distances and predecessors for a breadth first traversal of a graph. @@ -59,7 +67,9 @@ def bfs(graph, start : Integer Specify starting vertex for breadth-first search; this function iterates over edges in the component reachable from this node. - return_distances : bool, optional, default=False + depth_limit : Integer or None + Limit the depth of the search + return_distances : bool, optional, default=True Indicates if distances should be returned Returns @@ -76,7 +86,8 @@ def bfs(graph, Examples -------- >>> import cugraph.dask as dcg - >>> Comms.initialize(p2p=True) + >>> ... 
Init a DASK Cluster + >> see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html >>> chunksize = dcg.get_chunksize(input_data_path) >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize, delimiter=' ', @@ -85,24 +96,28 @@ def bfs(graph, >>> dg = cugraph.DiGraph() >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst') >>> df = dcg.bfs(dg, 0) - >>> Comms.destroy() """ client = default_client() graph.compute_renumber_edge_list(transposed=False) - (ddf, - num_verts, - partition_row_size, - partition_col_size, - vertex_partition_offsets) = shuffle(graph, transposed=False) + ddf = graph.edgelist.edgelist_df + vertex_partition_offsets = get_vertex_partition_offsets(graph) + num_verts = vertex_partition_offsets.iloc[-1] + num_edges = len(ddf) data = get_distributed_data(ddf) if graph.renumbered: - start = graph.lookup_internal_vertex_id(cudf.Series([start], - dtype='int32')).compute() - start = start.iloc[0] + if isinstance(start, dask_cudf.DataFrame)\ + or isinstance(start, cudf.DataFrame): + start = graph.lookup_internal_vertex_id(start, start.columns).\ + compute() + start = start.iloc[0] + else: + start = graph.lookup_internal_vertex_id(cudf.Series([start]) + ).compute() + start = start.iloc[0] result = [client.submit( call_bfs, @@ -111,7 +126,9 @@ def bfs(graph, num_verts, num_edges, vertex_partition_offsets, + graph.aggregate_segment_offsets, start, + depth_limit, return_distances, workers=[wf[0]]) for idx, wf in enumerate(data.worker_to_parts.items())] @@ -121,5 +138,5 @@ def bfs(graph, if graph.renumbered: ddf = graph.unrenumber(ddf, 'vertex') ddf = graph.unrenumber(ddf, 'predecessor') - ddf["predecessor"] = ddf["predecessor"].fillna(-1) + ddf = ddf.fillna(-1) return ddf diff --git a/python/cugraph/dask/traversal/mg_bfs.pxd b/python/cugraph/dask/traversal/mg_bfs.pxd index 82c6e97d668..d4f399bf689 100644 --- a/python/cugraph/dask/traversal/mg_bfs.pxd +++ b/python/cugraph/dask/traversal/mg_bfs.pxd @@ -1,5 +1,5 @@ # -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,11 +14,14 @@ # limitations under the License. # -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * from libcpp cimport bool +cdef extern from "limits.h": + cdef int INT_MAX + cdef long LONG_MAX -cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": cdef void call_bfs[vertex_t, weight_t]( const handle_t &handle, @@ -26,6 +29,6 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": vertex_t *identifiers, vertex_t *distances, vertex_t *predecessors, - double *sp_counters, + vertex_t depth_limit, const vertex_t start_vertex, - bool directed) except + + bool direction_optimizing) except + diff --git a/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx b/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx index c92f28eb407..a15d6704ac8 100644 --- a/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx +++ b/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
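Editor's note: `dcg.bfs` above now defaults `return_distances` to True and threads `depth_limit` through to `call_bfs` (unbounded when None, via the INT_MAX/LONG_MAX externs in the wrapper). A hedged usage sketch; dataset path and column names are assumptions:

```python
# Hedged usage sketch for the revised MG BFS signature above.
import cugraph
import cugraph.dask as dcg
import dask_cudf

chunksize = dcg.get_chunksize("datasets/karate.csv")
ddf = dask_cudf.read_csv("datasets/karate.csv", chunksize=chunksize,
                         delimiter=" ", names=["src", "dst", "value"],
                         dtype=["int32", "int32", "float32"])
dg = cugraph.DiGraph()
dg.from_dask_cudf_edgelist(ddf, "src", "dst")
df = dcg.bfs(dg, start=0, depth_limit=2)
# columns: vertex, distance, predecessor; unreached vertices are -1
# after the fillna(-1) added above.
```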
@@ -17,7 +17,7 @@ from cugraph.structure.utils_wrapper import * from cugraph.dask.traversal cimport mg_bfs as c_bfs import cudf -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * import cugraph.structure.graph_primtypes_wrapper as graph_primtypes_wrapper from libc.stdint cimport uintptr_t @@ -27,12 +27,13 @@ def mg_bfs(input_df, vertex_partition_offsets, rank, handle, + segment_offsets, start, + depth_limit, return_distances=False): """ - Call pagerank + Call BFS """ - cdef size_t handle_size_t = handle.getHandle() handle_ = handle_size_t @@ -43,13 +44,14 @@ def mg_bfs(input_df, if num_global_edges > (2**31 - 1): edge_t = np.dtype("int64") else: - edge_t = np.dtype("int32") + edge_t = vertex_t if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype else: weight_t = np.dtype("float32") + # FIXME: Offsets and indices are currently hardcoded to int, but this may # not be acceptable in the future. numberTypeMap = {np.dtype("int32") : numberTypeEnum.int32Type, @@ -58,7 +60,7 @@ def mg_bfs(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] @@ -68,26 +70,39 @@ def mg_bfs(input_df, vertex_partition_offsets_host = vertex_partition_offsets.values_host cdef uintptr_t c_vertex_partition_offsets = vertex_partition_offsets_host.__array_interface__['data'][0] + cdef vector[int] v_segment_offsets_32 + cdef vector[long] v_segment_offsets_64 + cdef uintptr_t c_segment_offsets + if (vertex_t == np.dtype("int32")): + v_segment_offsets_32 = segment_offsets + c_segment_offsets = v_segment_offsets_32.data() + else: + v_segment_offsets_64 = segment_offsets + c_segment_offsets = v_segment_offsets_64.data() + cdef graph_container_t graph_container populate_graph_container(graph_container, handle_[0], c_src_vertices, c_dst_vertices, c_edge_weights, c_vertex_partition_offsets, + c_segment_offsets, + len(segment_offsets) - 1, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, - True, + False, # BFS runs on unweighted graphs + False, False, True) # Generate the cudf.DataFrame result df = cudf.DataFrame() df['vertex'] = cudf.Series(np.arange(vertex_partition_offsets.iloc[rank], vertex_partition_offsets.iloc[rank+1]), dtype=vertex_t) - df['predecessor'] = cudf.Series(np.zeros(len(df['vertex']), dtype=np.int32)) + df['predecessor'] = cudf.Series(np.zeros(len(df['vertex']), dtype=vertex_t)) if (return_distances): - df['distance'] = cudf.Series(np.zeros(len(df['vertex']), dtype=np.int32)) + df['distance'] = cudf.Series(np.zeros(len(df['vertex']), dtype=vertex_t)) # Associate to cudf Series cdef uintptr_t c_distance_ptr = NULL # Pointer to the DataFrame 'distance' Series @@ -95,14 +110,28 @@ def mg_bfs(input_df, if (return_distances): c_distance_ptr = df['distance'].__cuda_array_interface__['data'][0] - cdef bool direction = 1 - # MG BFS path assumes directed is true - c_bfs.call_bfs[int, float](handle_[0], - graph_container, - NULL, - c_distance_ptr, - c_predecessor_ptr, - NULL, - start, - direction) + cdef bool direction_optimizing = 0 + + if vertex_t == np.int32: + if depth_limit is None: + depth_limit = c_bfs.INT_MAX + c_bfs.call_bfs[int, float](handle_[0], + 
graph_container, + NULL, + c_distance_ptr, + c_predecessor_ptr, + depth_limit, + start, + direction_optimizing) + else: + if depth_limit is None: + depth_limit = c_bfs.LONG_MAX + c_bfs.call_bfs[long, float](handle_[0], + graph_container, + NULL, + c_distance_ptr, + c_predecessor_ptr, + depth_limit, + start, + direction_optimizing) return df diff --git a/python/cugraph/dask/traversal/mg_sssp.pxd b/python/cugraph/dask/traversal/mg_sssp.pxd index f846facd269..937b42147e6 100644 --- a/python/cugraph/dask/traversal/mg_sssp.pxd +++ b/python/cugraph/dask/traversal/mg_sssp.pxd @@ -1,5 +1,5 @@ # -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * from libcpp cimport bool -cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": cdef void call_sssp[vertex_t, weight_t]( const handle_t &handle, diff --git a/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx b/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx index b7aec103098..63ac2942cfa 100644 --- a/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx +++ b/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ from cugraph.structure.utils_wrapper import * from cugraph.dask.traversal cimport mg_sssp as c_sssp import cudf -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * import cugraph.structure.graph_primtypes_wrapper as graph_primtypes_wrapper from libc.stdint cimport uintptr_t @@ -27,6 +27,7 @@ def mg_sssp(input_df, vertex_partition_offsets, rank, handle, + segment_offsets, start): """ Call sssp @@ -42,13 +43,15 @@ def mg_sssp(input_df, if num_global_edges > (2**31 - 1): edge_t = np.dtype("int64") else: - edge_t = np.dtype("int32") + edge_t = vertex_t if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype + is_weighted = True else: weights = None weight_t = np.dtype("float32") + is_weighted = False # FIXME: Offsets and indices are currently hardcoded to int, but this may # not be acceptable in the future. 
@@ -58,7 +61,7 @@ def mg_sssp(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] @@ -70,18 +73,31 @@ def mg_sssp(input_df, vertex_partition_offsets_host = vertex_partition_offsets.values_host cdef uintptr_t c_vertex_partition_offsets = vertex_partition_offsets_host.__array_interface__['data'][0] + cdef vector[int] v_segment_offsets_32 + cdef vector[long] v_segment_offsets_64 + cdef uintptr_t c_segment_offsets + if (vertex_t == np.dtype("int32")): + v_segment_offsets_32 = segment_offsets + c_segment_offsets = v_segment_offsets_32.data() + else: + v_segment_offsets_64 = segment_offsets + c_segment_offsets = v_segment_offsets_64.data() + cdef graph_container_t graph_container populate_graph_container(graph_container, handle_[0], c_src_vertices, c_dst_vertices, c_edge_weights, c_vertex_partition_offsets, + c_segment_offsets, + len(segment_offsets) - 1, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, - True, + is_weighted, + False, False, True) # Generate the cudf.DataFrame result @@ -95,21 +111,34 @@ def mg_sssp(input_df, cdef uintptr_t c_distance_ptr = df['distance'].__cuda_array_interface__['data'][0] # MG BFS path assumes directed is true - if weight_t == np.float32: - c_sssp.call_sssp[int, float](handle_[0], - graph_container, - NULL, - c_distance_ptr, - c_predecessor_ptr, - start) - elif weight_t == np.float64: - c_sssp.call_sssp[int, double](handle_[0], - graph_container, - NULL, - c_distance_ptr, - c_predecessor_ptr, - start) - else: # This case should not happen - raise NotImplementedError - + if vertex_t == np.int32: + if weight_t == np.float32: + c_sssp.call_sssp[int, float](handle_[0], + graph_container, + NULL, + c_distance_ptr, + c_predecessor_ptr, + start) + elif weight_t == np.float64: + c_sssp.call_sssp[int, double](handle_[0], + graph_container, + NULL, + c_distance_ptr, + c_predecessor_ptr, + start) + else: + if weight_t == np.float32: + c_sssp.call_sssp[long, float](handle_[0], + graph_container, + NULL, + c_distance_ptr, + c_predecessor_ptr, + start) + elif weight_t == np.float64: + c_sssp.call_sssp[long, double](handle_[0], + graph_container, + NULL, + c_distance_ptr, + c_predecessor_ptr, + start) return df diff --git a/python/cugraph/dask/traversal/sssp.py b/python/cugraph/dask/traversal/sssp.py index ce0c7908664..fbaee901d65 100644 --- a/python/cugraph/dask/traversal/sssp.py +++ b/python/cugraph/dask/traversal/sssp.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
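`call_bfs` above and `call_sssp` below both carve a fixed-width, per-worker window out of `graph.aggregate_segment_offsets`. A small sketch of that arithmetic, with made-up offset values:

```python
# Sketch of the per-worker slicing applied to aggregate_segment_offsets;
# the offset values here are invented for the example.
aggregate_segment_offsets = [0, 4, 9, 12,   # worker 0's local segments
                             0, 5, 8, 11]   # worker 1's local segments
n_workers = 2

local_size = len(aggregate_segment_offsets) // n_workers
for wid in range(n_workers):
    segment_offsets = \
        aggregate_segment_offsets[local_size * wid: local_size * (wid + 1)]
    print(wid, segment_offsets)
# 0 [0, 4, 9, 12]
# 1 [0, 5, 8, 11]
```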
@@ -14,8 +14,8 @@
 #

 from dask.distributed import wait, default_client
-from cugraph.dask.common.input_utils import get_distributed_data
-from cugraph.structure.shuffle import shuffle
+from cugraph.dask.common.input_utils import (get_distributed_data,
+                                             get_vertex_partition_offsets)
 from cugraph.dask.traversal import mg_sssp_wrapper as mg_sssp
 import cugraph.comms.comms as Comms
 import cudf
@@ -27,15 +27,20 @@ def call_sssp(sID,
              num_verts,
              num_edges,
              vertex_partition_offsets,
+             aggregate_segment_offsets,
              start):
     wid = Comms.get_worker_id(sID)
     handle = Comms.get_handle(sID)
+    local_size = len(aggregate_segment_offsets) // Comms.get_n_workers(sID)
+    segment_offsets = \
+        aggregate_segment_offsets[local_size * wid: local_size * (wid + 1)]
     return mg_sssp.mg_sssp(data[0],
                            num_verts,
                            num_edges,
                            vertex_partition_offsets,
                            wid,
                            handle,
+                           segment_offsets,
                            start)

@@ -76,7 +81,8 @@ def sssp(graph,
     Examples
     --------
     >>> import cugraph.dask as dcg
-    >>> Comms.initialize(p2p=True)
+    >>> ... Initialize a Dask cluster
+    >>> # see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
     >>> chunksize = dcg.get_chunksize(input_data_path)
     >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                              delimiter=' ',
@@ -85,23 +91,20 @@ def sssp(graph,
     >>> dg = cugraph.DiGraph()
     >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
     >>> df = dcg.sssp(dg, 0)
-    >>> Comms.destroy()
     """

     client = default_client()

     graph.compute_renumber_edge_list(transposed=False)
-    (ddf,
-     num_verts,
-     partition_row_size,
-     partition_col_size,
-     vertex_partition_offsets) = shuffle(graph, transposed=False)
+    ddf = graph.edgelist.edgelist_df
+    vertex_partition_offsets = get_vertex_partition_offsets(graph)
+    num_verts = vertex_partition_offsets.iloc[-1]
     num_edges = len(ddf)
     data = get_distributed_data(ddf)

     if graph.renumbered:
-        source = graph.lookup_internal_vertex_id(cudf.Series([source],
-                                                 dtype='int32')).compute()
+        source = graph.lookup_internal_vertex_id(cudf.Series([source])
+                                                 ).compute()
         source = source.iloc[0]

     result = [client.submit(
@@ -111,6 +114,7 @@
              num_verts,
              num_edges,
              vertex_partition_offsets,
+             graph.aggregate_segment_offsets,
              source,
              workers=[wf[0]])
              for idx, wf in enumerate(data.worker_to_parts.items())]
diff --git a/python/cugraph/bsp/__init__.py b/python/cugraph/generators/__init__.py
similarity index 85%
rename from python/cugraph/bsp/__init__.py
rename to python/cugraph/generators/__init__.py
index dbb94895cec..74ecc2384bd 100644
--- a/python/cugraph/bsp/__init__.py
+++ b/python/cugraph/generators/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -11,5 +11,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from cugraph.bsp.traversal import bfs_df_pregel
-
+from .rmat import rmat, multi_rmat
diff --git a/python/cugraph/generators/rmat.pxd b/python/cugraph/generators/rmat.pxd
new file mode 100644
index 00000000000..3c51108c778
--- /dev/null
+++ b/python/cugraph/generators/rmat.pxd
@@ -0,0 +1,45 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from libcpp cimport bool +from cugraph.structure.graph_utilities cimport * +from libcpp.vector cimport vector + +cdef extern from "cugraph/graph_generators.hpp" namespace "cugraph": + ctypedef enum generator_distribution_t: + POWER_LAW "cugraph::generator_distribution_t::POWER_LAW" + UNIFORM "cugraph::generator_distribution_t::UNIFORM" + + +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": + cdef unique_ptr[graph_generator_t] call_generate_rmat_edgelist[vertex_t] ( + const handle_t &handle, + size_t scale, + size_t num_edges, + double a, + double b, + double c, + int seed, + bool clip_and_flip, + bool scramble_vertex_ids) except + + + cdef vector[pair[unique_ptr[device_buffer], unique_ptr[device_buffer]]] call_generate_rmat_edgelists[vertex_t]( + const handle_t &handle, + size_t n_edgelists, + size_t min_scale, + size_t max_scale, + size_t edge_factor, + generator_distribution_t size_distribution, + generator_distribution_t edge_distribution, + int seed, + bool clip_and_flip, + bool scramble_vertex_ids) except + diff --git a/python/cugraph/generators/rmat.py b/python/cugraph/generators/rmat.py new file mode 100644 index 00000000000..d93ceb34cd1 --- /dev/null +++ b/python/cugraph/generators/rmat.py @@ -0,0 +1,396 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dask.distributed import default_client +import dask_cudf + +from cugraph.generators import rmat_wrapper +from cugraph.comms import comms as Comms +import cugraph + + +def _ensure_args_rmat( + scale, + num_edges, + a, + b, + c, + seed, + clip_and_flip, + scramble_vertex_ids, + create_using, + mg +): + """ + Ensures the args passed in are usable for the rmat() API, raises the + appropriate exception if incorrect, else returns None. 
+    """
+    if mg and create_using not in [None, cugraph.DiGraph]:
+        raise TypeError("Only cugraph.DiGraph and None are supported types "
+                        "for `create_using` for multi-GPU R-MAT")
+    if create_using not in [None, cugraph.Graph, cugraph.DiGraph]:
+        raise TypeError("Only cugraph.Graph, cugraph.DiGraph, and None are "
+                        "supported types for 'create_using'")
+    if not isinstance(scale, int):
+        raise TypeError("'scale' must be an int")
+    if not isinstance(num_edges, int):
+        raise TypeError("'num_edges' must be an int")
+    if a < 0 or b < 0 or c < 0 or (a + b + c) > 1:
+        raise ValueError(
+            "a + b + c should be non-negative and no larger than 1.0")
+    if (clip_and_flip not in [True, False]):
+        raise ValueError("'clip_and_flip' must be a bool")
+    if (scramble_vertex_ids not in [True, False]):
+        raise ValueError("'scramble_vertex_ids' must be a bool")
+    if not isinstance(seed, int):
+        raise TypeError("'seed' must be an int")
+
+
+def _ensure_args_multi_rmat(
+    n_edgelists,
+    min_scale,
+    max_scale,
+    edge_factor,
+    size_distribution,
+    edge_distribution,
+    seed,
+    clip_and_flip,
+    scramble_vertex_ids
+):
+    """
+    Ensures the args passed in are usable for the multi_rmat() API, raises the
+    appropriate exception if incorrect, else returns None.
+    """
+    if not isinstance(n_edgelists, int):
+        raise TypeError("'n_edgelists' must be an int")
+    if not isinstance(min_scale, int):
+        raise TypeError("'min_scale' must be an int")
+    if not isinstance(max_scale, int):
+        raise TypeError("'max_scale' must be an int")
+    if not isinstance(edge_factor, int):
+        raise TypeError("'edge_factor' must be an int")
+    if (size_distribution not in [0, 1]):
+        raise TypeError("'size_distribution' must be either 0 or 1")
+    if (edge_distribution not in [0, 1]):
+        raise TypeError("'edge_distribution' must be either 0 or 1")
+    if (clip_and_flip not in [True, False]):
+        raise ValueError("'clip_and_flip' must be a bool")
+    if (scramble_vertex_ids not in [True, False]):
+        raise ValueError("'scramble_vertex_ids' must be a bool")
+    if not isinstance(seed, int):
+        raise TypeError("'seed' must be an int")
+
+
+def _sg_rmat(
+    scale,
+    num_edges,
+    a,
+    b,
+    c,
+    seed,
+    clip_and_flip,
+    scramble_vertex_ids,
+    create_using=cugraph.DiGraph
+):
+    """
+    Calls RMAT on a single GPU and uses the resulting cuDF DataFrame
+    to initialize and return a cugraph Graph object specified with
+    create_using. If create_using is None, returns the edgelist df as-is.
+    """
+    df = rmat_wrapper.generate_rmat_edgelist(scale,
+                                             num_edges,
+                                             a,
+                                             b,
+                                             c,
+                                             seed,
+                                             clip_and_flip,
+                                             scramble_vertex_ids)
+    if create_using is None:
+        return df
+
+    G = create_using()
+    G.from_cudf_edgelist(df, source='src', destination='dst', renumber=False)
+
+    return G
+
+
+def _mg_rmat(
+    scale,
+    num_edges,
+    a,
+    b,
+    c,
+    seed,
+    clip_and_flip,
+    scramble_vertex_ids,
+    create_using=cugraph.DiGraph
+):
+    """
+    Calls RMAT on multiple GPUs and uses the resulting Dask cuDF DataFrame to
+    initialize and return a cugraph Graph object specified with create_using.
+    If create_using is None, returns the Dask DataFrame edgelist as-is.
+
+    seed is used as the initial seed for the first worker used (worker 0),
+    then each subsequent worker i will receive seed+i as its seed value.
+    """
+    client = default_client()
+    worker_list = list(client.scheduler_info()['workers'].keys())
+    num_workers = len(worker_list)
+    num_edges_list = _calc_num_edges_per_worker(num_workers, num_edges)
+    futures = []
+    for (i, worker_num_edges) in enumerate(num_edges_list):
+        unique_worker_seed = seed + i
+        future = client.submit(
+            _call_rmat,
+            Comms.get_session_id(),
+            scale,
+            worker_num_edges,
+            a,
+            b,
+            c,
+            unique_worker_seed,
+            clip_and_flip,
+            scramble_vertex_ids,
+            workers=worker_list[i]
+        )
+        futures.append(future)
+
+    ddf = dask_cudf.from_delayed(futures)
+
+    if create_using is None:
+        return ddf
+
+    G = create_using()
+    G.from_dask_cudf_edgelist(ddf, source="src", destination="dst")
+
+    return G
+
+
+def _call_rmat(
+    sID,
+    scale,
+    num_edges_for_worker,
+    a,
+    b,
+    c,
+    unique_worker_seed,
+    clip_and_flip,
+    scramble_vertex_ids
+):
+    """
+    Callable passed to dask client.submit calls that extracts the individual
+    worker handle based on the dask session ID
+    """
+    handle = Comms.get_handle(sID)
+
+    return rmat_wrapper.generate_rmat_edgelist(
+        scale,
+        num_edges_for_worker,
+        a,
+        b,
+        c,
+        unique_worker_seed,
+        clip_and_flip,
+        scramble_vertex_ids,
+        handle=handle
+    )
+
+
+def _calc_num_edges_per_worker(num_workers, num_edges):
+    """
+    Returns a list of length num_workers with the individual number of edges
+    each worker should generate. The sum of all edges in the list is
+    num_edges.
+    """
+    L = []
+    w = num_edges // num_workers
+    r = num_edges % num_workers
+    for i in range(num_workers):
+        if (i < r):
+            L.append(w+1)
+        else:
+            L.append(w)
+    return L
+
+
+###############################################################################
+
+def rmat(
+    scale,
+    num_edges,
+    a,
+    b,
+    c,
+    seed,
+    clip_and_flip,
+    scramble_vertex_ids,
+    create_using=cugraph.DiGraph,
+    mg=False
+):
+    """
+    Generate a Graph object using a Recursive MATrix (R-MAT) graph generation
+    algorithm.
+
+    Parameters
+    ----------
+    scale : int
+        Scale factor to set the number of vertices in the graph. Vertex IDs
+        have values in [0, V), where V = 1 << 'scale'
+
+    num_edges : int
+        Number of edges to generate
+
+    a : float
+        Probability of the first partition
+
+    b : float
+        Probability of the second partition
+
+    c : float
+        Probability of the third partition
+
+    seed : int
+        Seed value for the random number generator
+
+    clip_and_flip : bool
+        Flag controlling whether to generate edges only in the lower
+        triangular part (including the diagonal) of the graph adjacency
+        matrix (if set to 'true') or not (if set to 'false').
+
+    scramble_vertex_ids : bool
+        Flag controlling whether to scramble vertex ID bits (if set to `true`)
+        or not (if set to `false`); scrambling vertex ID bits breaks
+        correlation between vertex ID values and vertex degrees.
+
+    create_using : cugraph Graph type or None
+        The graph type to construct containing the generated edges and
+        vertices. If None is specified, the edgelist cuDF DataFrame (or
+        dask_cudf DataFrame for MG) is returned as-is. This is useful for
+        benchmarking Graph construction steps that require raw data that
+        includes potential self-loops, isolated vertices, and duplicated
+        edges. Default is cugraph.DiGraph.
+        NOTE: only the cugraph.DiGraph type is supported for multi-GPU
+
+    mg : bool
+        If True, R-MAT generation occurs across multiple GPUs. If False,
+        only a single GPU is used.
Default is False (single-GPU)
+
+    Returns
+    -------
+    instance of cugraph.Graph
+
+    Examples
+    --------
+    import cugraph
+    from cugraph.generators import rmat
+
+    df = rmat(
+        scale,
+        (2**scale)*edgefactor,
+        0.1,
+        0.2,
+        0.3,
+        seed or 42,
+        clip_and_flip=False,
+        scramble_vertex_ids=True,
+        create_using=None,  # return edgelist instead of Graph instance
+        mg=False
+    )
+
+    """
+    _ensure_args_rmat(scale, num_edges, a, b, c, seed, clip_and_flip,
+                      scramble_vertex_ids, create_using, mg)
+
+    if mg:
+        return _mg_rmat(scale, num_edges, a, b, c, seed, clip_and_flip,
+                        scramble_vertex_ids, create_using)
+    else:
+        return _sg_rmat(scale, num_edges, a, b, c, seed, clip_and_flip,
+                        scramble_vertex_ids, create_using)
+
+
+def multi_rmat(
+    n_edgelists,
+    min_scale,
+    max_scale,
+    edge_factor,
+    size_distribution,
+    edge_distribution,
+    seed,
+    clip_and_flip,
+    scramble_vertex_ids
+):
+    """
+    Generate multiple Graph objects using a Recursive MATrix (R-MAT) graph
+    generation algorithm.
+
+    Parameters
+    ----------
+    n_edgelists : int
+        Number of edge lists (graphs) to generate
+
+    min_scale : int
+        Scale factor to set the minimum number of vertices in the graph
+
+    max_scale : int
+        Scale factor to set the maximum number of vertices in the graph
+
+    edge_factor : int
+        Average number of edges per vertex to generate
+
+    size_distribution : int
+        Distribution of the graph sizes, impacts the scale parameter of the
+        R-MAT generator.
+        '0' for POWER_LAW distribution and '1' for UNIFORM distribution
+
+    edge_distribution : int
+        Edges distribution for each graph, impacts how R-MAT parameters
+        a, b, c, d are set.
+        '0' for POWER_LAW distribution and '1' for UNIFORM distribution
+
+    seed : int
+        Seed value for the random number generator
+
+    clip_and_flip : bool
+        Flag controlling whether to generate edges only in the lower
+        triangular part (including the diagonal) of the graph adjacency
+        matrix (if set to 'true') or not (if set to 'false')
+
+    scramble_vertex_ids : bool
+        Flag controlling whether to scramble vertex ID bits (if set to 'true')
+        or not (if set to 'false'); scrambling vertex ID bits breaks
+        correlation between vertex ID values and vertex degrees
+
+    Returns
+    -------
+    list of cugraph.Graph instances
+    """
+    _ensure_args_multi_rmat(n_edgelists, min_scale, max_scale, edge_factor,
+                            size_distribution, edge_distribution, seed,
+                            clip_and_flip, scramble_vertex_ids)
+
+    dfs = rmat_wrapper.generate_rmat_edgelists(
+        n_edgelists,
+        min_scale,
+        max_scale,
+        edge_factor,
+        size_distribution,
+        edge_distribution,
+        seed,
+        clip_and_flip,
+        scramble_vertex_ids)
+    list_G = []
+
+    for df in dfs:
+        G = cugraph.Graph()
+        G.from_cudf_edgelist(df, source='src', destination='dst')
+        list_G.append(G)
+
+    return list_G
diff --git a/python/cugraph/generators/rmat_wrapper.pyx b/python/cugraph/generators/rmat_wrapper.pyx
new file mode 100644
index 00000000000..26f3772ad32
--- /dev/null
+++ b/python/cugraph/generators/rmat_wrapper.pyx
@@ -0,0 +1,171 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
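To illustrate the split performed by `_calc_num_edges_per_worker` above, and the per-worker seeding in `_mg_rmat`, a small worked example (the numbers are illustrative):

```python
# The remainder is spread over the first (num_edges % num_workers)
# workers, so the per-worker counts always sum to num_edges.
num_workers, num_edges = 4, 10
w, r = divmod(num_edges, num_workers)
counts = [w + 1 if i < r else w for i in range(num_workers)]
print(counts)       # [3, 3, 2, 2]
print(sum(counts))  # 10

# Worker i is also seeded with seed + i, e.g. seed=42 -> 42, 43, 44, 45,
# so each worker draws an independent edge stream.
```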
+# See the License for the specific language governing permissions and +# limitations under the License. + +from libcpp cimport bool +from libc.stdint cimport uintptr_t +import numpy as np +import numpy.ctypeslib as ctypeslib +from cython.operator cimport dereference as deref + +import rmm +from rmm._lib.device_buffer cimport DeviceBuffer +import cudf +from cudf.core.buffer import Buffer + +from cugraph.structure.graph_utilities cimport * +from cugraph.generators.rmat cimport * +from libcpp.utility cimport move # This must be imported after graph_utilities + # since graph_utilities also defines move + + +def generate_rmat_edgelist( + scale, + num_edges, + a, + b, + c, + seed, + clip_and_flip, + scramble_vertex_ids, + handle=None +): + + vertex_t = np.dtype("int32") + if (2**scale) > (2**31 - 1): + vertex_t = np.dtype("int64") + + cdef unique_ptr[handle_t] handle_ptr + cdef size_t handle_size_t + + if handle is None: + handle_ptr.reset(new handle_t()) + handle_ = handle_ptr.get() + else: + handle_size_t = handle.getHandle() + handle_ = handle_size_t + + cdef unique_ptr[graph_generator_t] gg_ret_ptr + + if (vertex_t==np.dtype("int32")): + gg_ret_ptr = move(call_generate_rmat_edgelist[int]( deref(handle_), + scale, + num_edges, + a, + b, + c, + seed, + clip_and_flip, + scramble_vertex_ids)) + else: # (vertex_t == np.dtype("int64")) + gg_ret_ptr = move(call_generate_rmat_edgelist[long]( deref(handle_), + scale, + num_edges, + a, + b, + c, + seed, + clip_and_flip, + scramble_vertex_ids)) + + gg_ret = move(gg_ret_ptr.get()[0]) + source_set = DeviceBuffer.c_from_unique_ptr(move(gg_ret.d_source)) + destination_set = DeviceBuffer.c_from_unique_ptr(move(gg_ret.d_destination)) + source_set = Buffer(source_set) + destination_set = Buffer(destination_set) + + set_source = cudf.Series(data=source_set, dtype=vertex_t) + set_destination = cudf.Series(data=destination_set, dtype=vertex_t) + + df = cudf.DataFrame() + df['src'] = set_source + df['dst'] = set_destination + + return df + + +def generate_rmat_edgelists( + n_edgelists, + min_scale, + max_scale, + edge_factor, + size_distribution, + edge_distribution, + seed, + clip_and_flip, + scramble_vertex_ids + ): + + vertex_t = np.dtype("int32") + if (2**max_scale) > (2**31 - 1): + vertex_t = np.dtype("int64") + + cdef unique_ptr[handle_t] handle_ptr + handle_ptr.reset(new handle_t()) + handle_ = handle_ptr.get() + + cdef generator_distribution_t s_distribution + cdef generator_distribution_t e_distribution + if size_distribution == 0: + s_distribution= POWER_LAW + else : + s_distribution= UNIFORM + if edge_distribution == 0: + e_distribution= POWER_LAW + else : + e_distribution= UNIFORM + #cdef unique_ptr[graph_generator_t*] gg_ret_ptr + cdef vector[pair[unique_ptr[device_buffer], unique_ptr[device_buffer]]] gg_ret_ptr + + if (vertex_t==np.dtype("int32")): + #gg_ret_ptr = move(call_generate_rmat_edgelists[int]( deref(handle_), + gg_ret_ptr = move(call_generate_rmat_edgelists[int]( deref(handle_), + n_edgelists, + min_scale, + max_scale, + edge_factor, + s_distribution, + e_distribution, + seed, + clip_and_flip, + scramble_vertex_ids)) + else: # (vertex_t == np.dtype("int64")) + #gg_ret_ptr = move(call_generate_rmat_edgelists[long]( deref(handle_), + gg_ret_ptr = move(call_generate_rmat_edgelists[long]( deref(handle_), + n_edgelists, + min_scale, + max_scale, + edge_factor, + s_distribution, + e_distribution, + seed, + clip_and_flip, + scramble_vertex_ids)) + list_df = [] + + for i in range(n_edgelists): + source_set = 
DeviceBuffer.c_from_unique_ptr(move(gg_ret_ptr[i].first)) + destination_set = DeviceBuffer.c_from_unique_ptr(move(gg_ret_ptr[i].second)) + source_set = Buffer(source_set) + destination_set = Buffer(destination_set) + + set_source = cudf.Series(data=source_set, dtype=vertex_t) + set_destination = cudf.Series(data=destination_set, dtype=vertex_t) + + df = cudf.DataFrame() + df['src'] = set_source + df['dst'] = set_destination + + list_df.append(df) + + #Return a list of dataframes + return list_df diff --git a/python/cugraph/internals/callbacks_implems.hpp b/python/cugraph/internals/callbacks_implems.hpp index 7b3a27f6bff..79fab937965 100644 --- a/python/cugraph/internals/callbacks_implems.hpp +++ b/python/cugraph/internals/callbacks_implems.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include diff --git a/python/cugraph/layout/force_atlas2.pxd b/python/cugraph/layout/force_atlas2.pxd index cda55cda5c5..5496d1b655e 100644 --- a/python/cugraph/layout/force_atlas2.pxd +++ b/python/cugraph/layout/force_atlas2.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -19,12 +19,13 @@ from cugraph.structure.graph_primtypes cimport * from libcpp cimport bool -cdef extern from "internals.hpp" namespace "cugraph::internals": +cdef extern from "cugraph/internals.hpp" namespace "cugraph::internals": cdef cppclass GraphBasedDimRedCallback -cdef extern from "algorithms.hpp" namespace "cugraph": +cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": cdef void force_atlas2[vertex_t, edge_t, weight_t]( + const handle_t &handle, GraphCOOView[vertex_t, edge_t, weight_t] &graph, float *pos, const int max_iter, diff --git a/python/cugraph/layout/force_atlas2.py b/python/cugraph/layout/force_atlas2.py index 4c6859c6c03..d15109249e5 100644 --- a/python/cugraph/layout/force_atlas2.py +++ b/python/cugraph/layout/force_atlas2.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,7 +12,6 @@ # limitations under the License. 
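The loop above hands each generated column from C++ to cudf via an RMM `DeviceBuffer` wrapped in a cudf `Buffer`. A minimal sketch of that chain from the Python side; the host-to-device copy here is only to make the example self-contained, and the `Buffer` import path follows the cudf version this changeset targets:

```python
import numpy as np
import rmm
import cudf
from cudf.core.buffer import Buffer

# One host->device copy to obtain a device buffer for the example;
# in the wrapper the buffer arrives directly from libcugraph.
host = np.arange(6, dtype="int32")
dbuf = rmm.DeviceBuffer(ptr=host.ctypes.data, size=host.nbytes)

# Wrapping in a Buffer and viewing it as a typed Series adds no copies.
src = cudf.Series(data=Buffer(dbuf), dtype="int32")
df = cudf.DataFrame({"src": src})
print(df)
```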
from cugraph.layout import force_atlas2_wrapper -from cugraph.structure.graph import null_check def force_atlas2( @@ -109,13 +108,14 @@ def on_train_end(self, positions): """ if pos_list is not None: - null_check(pos_list["vertex"]) - null_check(pos_list["x"]) - null_check(pos_list["y"]) if input_graph.renumbered is True: + if input_graph.vertex_column_size() > 1: + cols = pos_list.columns[:-2].to_list() + else: + cols = 'vertex' pos_list = input_graph.add_internal_vertex_id(pos_list, "vertex", - "vertex") + cols) if prevent_overlapping: raise Exception("Feature not supported") diff --git a/python/cugraph/layout/force_atlas2_wrapper.pyx b/python/cugraph/layout/force_atlas2_wrapper.pyx index 785ddda47bd..1644875f034 100644 --- a/python/cugraph/layout/force_atlas2_wrapper.pyx +++ b/python/cugraph/layout/force_atlas2_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -19,17 +19,13 @@ from cugraph.layout.force_atlas2 cimport force_atlas2 as c_force_atlas2 from cugraph.structure import graph_primtypes_wrapper from cugraph.structure.graph_primtypes cimport * -from cugraph.structure import utils_wrapper from libcpp cimport bool from libc.stdint cimport uintptr_t - import cudf -import cudf._lib as libcudf from numba import cuda import numpy as np -import numpy.ctypeslib as ctypeslib -cdef extern from "internals.hpp" namespace "cugraph::internals": +cdef extern from "cugraph/internals.hpp" namespace "cugraph::internals": cdef cppclass GraphBasedDimRedCallback @@ -53,6 +49,10 @@ def force_atlas2(input_graph, Call force_atlas2 """ + cdef unique_ptr[handle_t] handle_ptr + handle_ptr.reset(new handle_t()) + handle_ = handle_ptr.get(); + if not input_graph.edgelist: input_graph.view_edge_list() @@ -65,12 +65,19 @@ def force_atlas2(input_graph, df = cudf.DataFrame() df['vertex'] = cudf.Series(np.arange(num_verts, dtype=np.int32)) - cdef uintptr_t c_src_indices = input_graph.edgelist.edgelist_df['src'].__cuda_array_interface__['data'][0] - cdef uintptr_t c_dst_indices = input_graph.edgelist.edgelist_df['dst'].__cuda_array_interface__['data'][0] + src = input_graph.edgelist.edgelist_df['src'] + dst = input_graph.edgelist.edgelist_df['dst'] + + [src, dst] = graph_primtypes_wrapper.datatype_cast([src, dst], [np.int32]) + + cdef uintptr_t c_src_indices = src.__cuda_array_interface__['data'][0] + cdef uintptr_t c_dst_indices = dst.__cuda_array_interface__['data'][0] cdef uintptr_t c_weights = NULL if input_graph.edgelist.weights: - c_weights = input_graph.edgelist.edgelist_df['weights'].__cuda_array_interface__['data'][0] + weights = input_graph.edgelist.edgelist_df["weights"] + [weights] = graph_primtypes_wrapper.datatype_cast([weights], [np.float32, np.float64]) + c_weights = weights.__cuda_array_interface__['data'][0] cdef uintptr_t x_start = NULL cdef uintptr_t y_start = NULL @@ -104,7 +111,8 @@ def force_atlas2(input_graph, graph_double = GraphCOOView[int,int, double](c_src_indices, c_dst_indices, c_weights, num_verts, num_edges) - c_force_atlas2[int, int, double](graph_double, + c_force_atlas2[int, int, double](handle_[0], + graph_double, pos_ptr, max_iter, x_start, @@ -125,7 +133,8 @@ def force_atlas2(input_graph, graph_float = GraphCOOView[int,int,float](c_src_indices, c_dst_indices, c_weights, num_verts, num_edges) - c_force_atlas2[int, int, 
float](graph_float, + c_force_atlas2[int, int, float](handle_[0], + graph_float, pos_ptr, max_iter, x_start, diff --git a/python/cugraph/linear_assignment/__init__.py b/python/cugraph/linear_assignment/__init__.py index 9bf09b67ed9..557bbbdf170 100644 --- a/python/cugraph/linear_assignment/__init__.py +++ b/python/cugraph/linear_assignment/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,4 +11,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.linear_assignment.lap import hungarian +from cugraph.linear_assignment.lap import hungarian, dense_hungarian diff --git a/python/cugraph/linear_assignment/lap.pxd b/python/cugraph/linear_assignment/lap.pxd index f7991405b7f..9f65e215891 100644 --- a/python/cugraph/linear_assignment/lap.pxd +++ b/python/cugraph/linear_assignment/lap.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,11 +18,36 @@ from cugraph.structure.graph_primtypes cimport * -cdef extern from "algorithms.hpp" namespace "cugraph": +cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": - cdef void hungarian[VT,ET,WT]( + cdef weight_t hungarian[vertex_t,edge_t,weight_t]( const handle_t &handle, - const GraphCOOView[VT,ET,WT] &graph, - VT num_workers, - const VT *workers, - VT *assignment) except + + const GraphCOOView[vertex_t,edge_t,weight_t] &graph, + vertex_t num_workers, + const vertex_t *workers, + vertex_t *assignments, + weight_t epsilon) except + + + cdef weight_t hungarian[vertex_t,edge_t,weight_t]( + const handle_t &handle, + const GraphCOOView[vertex_t,edge_t,weight_t] &graph, + vertex_t num_workers, + const vertex_t *workers, + vertex_t *assignments) except + + +cdef extern from "cugraph/algorithms.hpp": + + cdef weight_t dense_hungarian "cugraph::dense::hungarian" [vertex_t,weight_t]( + const handle_t &handle, + const weight_t *costs, + vertex_t num_rows, + vertex_t num_columns, + vertex_t *assignments, + weight_t epsilon) except + + + cdef weight_t dense_hungarian "cugraph::dense::hungarian" [vertex_t,weight_t]( + const handle_t &handle, + const weight_t *costs, + vertex_t num_rows, + vertex_t num_columns, + vertex_t *assignments) except + diff --git a/python/cugraph/linear_assignment/lap.py b/python/cugraph/linear_assignment/lap.py index 5c501d17935..ed40e96fb47 100644 --- a/python/cugraph/linear_assignment/lap.py +++ b/python/cugraph/linear_assignment/lap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,10 +11,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import cudf from cugraph.linear_assignment import lap_wrapper -def hungarian(G, workers): +def hungarian(G, workers, epsilon=None): """ Execute the Hungarian algorithm against a symmetric, weighted, bipartite graph. 
@@ -39,13 +40,21 @@ def hungarian(G, workers):
         as an edge list.  Edge weights are required. If an edge list
         is not provided then it will be computed.

-    workers : cudf.Series
+    workers : cudf.Series or cudf.DataFrame
         A series or column that identifies the vertex ids of the vertices
-        in the workers set.  All vertices in G that are not in the workers
+        in the workers set. In case of multi-column vertices, it should be a
+        cudf.DataFrame. All vertices in G that are not in the workers
         set are implicitly assigned to the jobs set.

+    epsilon : float or double (matching weight type in graph)
+        Used for determining when value is close enough to zero to consider 0.
+        Defaults (if not specified) to 1e-6 in the C++ code. Unused for
+        integer weight types.
+
     Returns
     -------
+    cost : matches the graph's weight dtype
+        The cost of the overall assignment
     df : cudf.DataFrame
       df['vertex'][i] gives the vertex id of the i'th vertex.  Only vertices
       in the workers list are defined in this column.
@@ -60,18 +69,64 @@ def hungarian(G, workers):
    >>>                      dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2')
-   >>> df = cugraph.hungarian(G, workers)
+   >>> cost, df = cugraph.hungarian(G, workers)

    """

    if G.renumbered:
-        local_workers = G.lookup_internal_vertex_id(workers)
+        if isinstance(workers, cudf.DataFrame):
+            local_workers = G.lookup_internal_vertex_id(workers,
+                                                        workers.columns)
+        else:
+            local_workers = G.lookup_internal_vertex_id(workers)
    else:
        local_workers = workers

-    df = lap_wrapper.hungarian(G, local_workers)
+    cost, df = lap_wrapper.sparse_hungarian(G, local_workers, epsilon)

    if G.renumbered:
        df = G.unrenumber(df, 'vertex')

-    return df
+    return cost, df
+
+
+def dense_hungarian(costs, num_rows, num_columns, epsilon=None):
+    """
+    Execute the Hungarian algorithm against a dense bipartite
+    graph representation.
+
+    *NOTE*: This API is unstable and subject to change
+
+    The Hungarian algorithm identifies the lowest cost matching of vertices
+    such that all workers that can be assigned work are assigned exactly
+    one job.
+
+    Parameters
+    ----------
+    costs : cudf.Series
+        A dense representation (row major order) of the bipartite
+        graph.  Each row represents a worker, each column represents
+        a task, cost[i][j] represents the cost of worker i performing
+        task j.
+    num_rows : int
+        Number of rows in the matrix
+    num_columns : int
+        Number of columns in the matrix
+    epsilon : float or double (matching weight type in graph)
+        Used for determining when value is close enough to zero to consider 0.
+        Defaults (if not specified) to 1e-6 in the C++ code. Unused for
+        integer weight types.
+
+    Returns
+    -------
+    cost : matches costs.dtype
+        The cost of the overall assignment
+    assignment : cudf.Series
+        assignment[i] gives the vertex id of the task assigned to the
+        worker i
+
+    FIXME: Update this with a real example...
+
+    """
+
+    return lap_wrapper.dense_hungarian(costs, num_rows, num_columns, epsilon)
diff --git a/python/cugraph/linear_assignment/lap_wrapper.pyx b/python/cugraph/linear_assignment/lap_wrapper.pyx
index caaa837e859..c173f45fa3f 100644
--- a/python/cugraph/linear_assignment/lap_wrapper.pyx
+++ b/python/cugraph/linear_assignment/lap_wrapper.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -17,15 +17,15 @@
 # cython: language_level = 3

 from cugraph.linear_assignment.lap cimport hungarian as c_hungarian
+from cugraph.linear_assignment.lap cimport dense_hungarian as c_dense_hungarian
 from cugraph.structure.graph_primtypes cimport *
 from cugraph.structure import graph_primtypes_wrapper
 from libc.stdint cimport uintptr_t

-from cugraph.structure.graph import Graph as type_Graph
-
 import cudf
 import numpy as np

-def hungarian(input_graph, workers):
+
+def sparse_hungarian(input_graph, workers, epsilon):
     """
     Call the hungarian algorithm
     """
@@ -62,6 +62,9 @@ def sparse_hungarian(input_graph, workers, epsilon):
     df['vertex'] = workers
     df['assignment'] = cudf.Series(np.zeros(len(workers), dtype=np.int32))

+    if epsilon is None:
+        epsilon = 1e-6
+
     cdef uintptr_t c_src = src.__cuda_array_interface__['data'][0]
     cdef uintptr_t c_dst = dst.__cuda_array_interface__['data'][0]
     cdef uintptr_t c_weights = weights.__cuda_array_interface__['data'][0]
@@ -69,6 +72,8 @@ def sparse_hungarian(input_graph, workers, epsilon):
     cdef uintptr_t c_identifier = df['vertex'].__cuda_array_interface__['data'][0];
     cdef uintptr_t c_assignment = df['assignment'].__cuda_array_interface__['data'][0];

+    cdef float c_epsilon_float = epsilon
+    cdef double c_epsilon_double = epsilon

     cdef GraphCOOView[int,int,float] g_float
     cdef GraphCOOView[int,int,double] g_double
@@ -76,10 +81,43 @@ def sparse_hungarian(input_graph, workers, epsilon):
     if weights.dtype == np.float32:
         g_float = GraphCOOView[int,int,float](c_src, c_dst, c_weights, num_verts, num_edges)

-        c_hungarian[int,int,float](handle_[0], g_float, len(workers), c_workers, c_assignment)
+        cost = c_hungarian[int,int,float](handle_[0], g_float, len(workers), c_workers, c_assignment, c_epsilon_float)
     else:
         g_double = GraphCOOView[int,int,double](c_src, c_dst, c_weights, num_verts, num_edges)

-        c_hungarian[int,int,double](handle_[0], g_double, len(workers), c_workers, c_assignment)
+        cost = c_hungarian[int,int,double](handle_[0], g_double, len(workers), c_workers, c_assignment, c_epsilon_double)
+
+    return cost, df
+
+
+def dense_hungarian(costs, num_rows, num_columns, epsilon):
+    """
+    Call the dense hungarian algorithm
+    """
+    if type(costs) is not cudf.Series:
+        raise TypeError("costs must be a cudf.Series")
+
+    cdef unique_ptr[handle_t] handle_ptr
+    handle_ptr.reset(new handle_t())
+    handle_ = handle_ptr.get();
+
+    assignment = cudf.Series(np.zeros(num_rows, dtype=np.int32))
+
+    if epsilon is None:
+        epsilon = 1e-6
+
+    cdef uintptr_t c_costs = costs.__cuda_array_interface__['data'][0]
+    cdef uintptr_t c_assignment = assignment.__cuda_array_interface__['data'][0]
+    cdef float c_epsilon_float = epsilon
+    cdef double c_epsilon_double = epsilon
+
+    if costs.dtype == np.float32:
+        cost = c_dense_hungarian[int,float](handle_[0], c_costs, num_rows, num_columns, c_assignment, c_epsilon_float)
+    elif costs.dtype == np.float64:
+        cost = c_dense_hungarian[int,double](handle_[0], c_costs, num_rows, num_columns, c_assignment, c_epsilon_double)
+    elif costs.dtype == np.int32:
+        cost = c_dense_hungarian[int,double](handle_[0], c_costs, num_rows, num_columns, c_assignment)
+    else:
+        raise TypeError("unsupported type: " + str(costs.dtype))

-    return df
+    return cost, assignment
diff --git a/python/cugraph/link_analysis/hits.pxd b/python/cugraph/link_analysis/hits.pxd
index 60d25fd3cdb..9e40f7444f9 100644
--- a/python/cugraph/link_analysis/hits.pxd
+++ b/python/cugraph/link_analysis/hits.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
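A small sketch of the two-value returns introduced above, using the dense form with made-up costs (2 workers by 2 tasks, row-major):

```python
import cudf
from cugraph.linear_assignment import dense_hungarian

# Row-major cost matrix (values invented for the example):
#   worker 0: task0=1.0, task1=4.0
#   worker 1: task0=2.0, task1=3.0
costs = cudf.Series([1.0, 4.0, 2.0, 3.0], dtype="float32")

cost, assignment = dense_hungarian(costs, 2, 2)
print(cost)        # total cost of the optimal matching (4.0 here)
print(assignment)  # assignment[i] is the task assigned to worker i
```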
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -20,7 +20,7 @@ from cugraph.structure.graph_primtypes cimport * from libcpp cimport bool -cdef extern from "algorithms.hpp" namespace "cugraph::gunrock": +cdef extern from "cugraph/algorithms.hpp" namespace "cugraph::gunrock": cdef void hits[VT,ET,WT]( const GraphCSRView[VT,ET,WT] &graph, diff --git a/python/cugraph/link_analysis/hits_wrapper.pyx b/python/cugraph/link_analysis/hits_wrapper.pyx index 3e19e38a023..2a2d33dea0b 100644 --- a/python/cugraph/link_analysis/hits_wrapper.pyx +++ b/python/cugraph/link_analysis/hits_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,13 +18,10 @@ from cugraph.link_analysis.hits cimport hits as c_hits from cugraph.structure.graph_primtypes cimport * -from libcpp cimport bool from libc.stdint cimport uintptr_t from cugraph.structure import graph_primtypes_wrapper import cudf -import rmm import numpy as np -import numpy.ctypeslib as ctypeslib def hits(input_graph, max_iter=100, tol=1.0e-5, nstart=None, normalized=True): @@ -48,8 +45,6 @@ def hits(input_graph, max_iter=100, tol=1.0e-5, nstart=None, normalized=True): df['hubs'] = cudf.Series(np.zeros(num_verts, dtype=np.float32)) df['authorities'] = cudf.Series(np.zeros(num_verts, dtype=np.float32)) - #cdef bool normalized = 1 - cdef uintptr_t c_identifier = df['vertex'].__cuda_array_interface__['data'][0]; cdef uintptr_t c_hubs = df['hubs'].__cuda_array_interface__['data'][0]; cdef uintptr_t c_authorities = df['authorities'].__cuda_array_interface__['data'][0]; diff --git a/python/cugraph/link_analysis/pagerank.pxd b/python/cugraph/link_analysis/pagerank.pxd index 79cb033f74b..ed8f763b3ca 100644 --- a/python/cugraph/link_analysis/pagerank.pxd +++ b/python/cugraph/link_analysis/pagerank.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -16,11 +16,11 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * from libcpp cimport bool -cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": cdef void call_pagerank[VT,WT]( const handle_t &handle, diff --git a/python/cugraph/link_analysis/pagerank.py b/python/cugraph/link_analysis/pagerank.py index 69133d62af7..94b1491e944 100644 --- a/python/cugraph/link_analysis/pagerank.py +++ b/python/cugraph/link_analysis/pagerank.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,7 +12,6 @@ # limitations under the License. 
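For the personalization handling below: a minimal single-GPU sketch with a made-up edge list and a two-vertex personalization frame.

```python
import cudf
import cugraph

# Tiny graph; edges and values are invented for the example.
edges = cudf.DataFrame({"src": [0, 1, 2, 2], "dst": [1, 2, 0, 1]})
G = cugraph.DiGraph()
G.from_cudf_edgelist(edges, source="src", destination="dst")

# Bias the ranking toward vertices 0 and 2.
personalization = cudf.DataFrame({"vertex": [0, 2],
                                  "values": [0.5, 0.5]})
pr = cugraph.pagerank(G, alpha=0.85, personalization=personalization)
print(pr.sort_values("pagerank", ascending=False))
```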
from cugraph.link_analysis import pagerank_wrapper -from cugraph.structure.graph import null_check import cugraph @@ -32,7 +31,7 @@ def pagerank( ---------- graph : cugraph.Graph or networkx.Graph cuGraph graph descriptor, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). + as an edge list. The transposed adjacency list will be computed if not already present. alpha : float The damping factor alpha represents the probability to follow an @@ -46,7 +45,6 @@ def pagerank( Subset of vertices of graph for personalization personalization['values'] : cudf.Series Personalization values for vertices - max_iter : int The maximum number of iterations before an answer is returned. This can be used to limit the execution time and do an early exit before the @@ -68,11 +66,10 @@ def pagerank( Subset of vertices of graph for initial guess for pagerank values nstart['values'] : cudf.Series Pagerank values for vertices - - weight : str - Edge data column to use. Default is None - This version of PageRank current does not use edge weight. - This parameter is here for NetworkX compatibility + weight: str + The attribute column to be used as edge weights if Graph is a NetworkX + Graph. This parameter is here for NetworkX compatibility and is ignored + in case of a cugraph.Graph dangling : dict This parameter is here for NetworkX compatibility and ignored @@ -100,17 +97,23 @@ def pagerank( G, isNx = cugraph.utilities.check_nx_graph(G, weight) if personalization is not None: - null_check(personalization["vertex"]) - null_check(personalization["values"]) if G.renumbered is True: + if len(G.renumber_map.implementation.col_names) > 1: + cols = personalization.columns[:-1].to_list() + else: + cols = 'vertex' personalization = G.add_internal_vertex_id( - personalization, "vertex", "vertex" + personalization, "vertex", cols ) if nstart is not None: if G.renumbered is True: + if len(G.renumber_map.implementation.col_names) > 1: + cols = nstart.columns[:-1].to_list() + else: + cols = 'vertex' nstart = G.add_internal_vertex_id( - nstart, "vertex", "vertex" + nstart, "vertex", cols ) df = pagerank_wrapper.pagerank( diff --git a/python/cugraph/link_analysis/pagerank_wrapper.pyx b/python/cugraph/link_analysis/pagerank_wrapper.pyx index a8c1c9faee8..7198ccabc9e 100644 --- a/python/cugraph/link_analysis/pagerank_wrapper.pyx +++ b/python/cugraph/link_analysis/pagerank_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -16,16 +16,13 @@ # cython: embedsignature = True # cython: language_level = 3 -#cimport cugraph.link_analysis.pagerank as c_pagerank from cugraph.link_analysis.pagerank cimport call_pagerank -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * from libcpp cimport bool from libc.stdint cimport uintptr_t from cugraph.structure import graph_primtypes_wrapper import cudf -import rmm import numpy as np -import numpy.ctypeslib as ctypeslib def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1.0e-5, nstart=None): @@ -33,21 +30,22 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. 
Call pagerank """ - if not input_graph.transposedadjlist: - input_graph.view_transposed_adj_list() - cdef unique_ptr[handle_t] handle_ptr handle_ptr.reset(new handle_t()) handle_ = handle_ptr.get(); - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.transposedadjlist.offsets, input_graph.transposedadjlist.indices], [np.int32]) - [weights] = graph_primtypes_wrapper.datatype_cast([input_graph.transposedadjlist.weights], [np.float32, np.float64]) + [src, dst] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']], [np.int32]) + weights = None + if input_graph.edgelist.weights: + [weights] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['weights']], [np.float32, np.float64]) num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) + # FIXME: needs to be edge_t type not int + cdef int num_local_edges = len(src) df = cudf.DataFrame() - df['vertex'] = cudf.Series(np.zeros(num_verts, dtype=np.int32)) + df['vertex'] = cudf.Series(np.arange(num_verts, dtype=np.int32)) df['pagerank'] = cudf.Series(np.zeros(num_verts, dtype=np.float32)) cdef bool has_guess = 0 @@ -62,22 +60,23 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. cdef uintptr_t c_pers_vtx = NULL cdef uintptr_t c_pers_val = NULL - cdef sz = 0 - - cdef uintptr_t c_offsets = offsets.__cuda_array_interface__['data'][0] - cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0] - cdef uintptr_t c_weights = NULL - cdef uintptr_t c_local_verts = NULL; - cdef uintptr_t c_local_edges = NULL; - cdef uintptr_t c_local_offsets = NULL; + cdef int sz = 0 + cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] + cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] + cdef uintptr_t c_edge_weights = NULL + personalization_id_series = None if weights is not None: - c_weights = weights.__cuda_array_interface__['data'][0] + c_edge_weights = weights.__cuda_array_interface__['data'][0] weight_t = weights.dtype + is_weighted = True else: weight_t = np.dtype("float32") + is_weighted = False + + is_symmetric = not input_graph.is_directed() # FIXME: Offsets and indices are currently hardcoded to int, but this may # not be acceptable in the future. @@ -94,15 +93,21 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. 
c_pers_val = personalization['values'].__cuda_array_interface__['data'][0] cdef graph_container_t graph_container - populate_graph_container_legacy(graph_container, - ((graphTypeEnum.LegacyCSC)), - handle_[0], - c_offsets, c_indices, c_weights, - ((numberTypeEnum.int32Type)), - ((numberTypeEnum.int32Type)), - ((numberTypeMap[weight_t])), - num_verts, num_edges, - c_local_verts, c_local_edges, c_local_offsets) + populate_graph_container(graph_container, + handle_[0], + c_src_vertices, c_dst_vertices, c_edge_weights, + NULL, + NULL, + 0, + ((numberTypeEnum.int32Type)), + ((numberTypeEnum.int32Type)), + ((numberTypeMap[weight_t])), + num_local_edges, + num_verts, num_edges, + is_weighted, + is_symmetric, + True, + False) if (df['pagerank'].dtype == np.float32): call_pagerank[int, float](handle_[0], graph_container, diff --git a/python/cugraph/link_prediction/jaccard.pxd b/python/cugraph/link_prediction/jaccard.pxd index bc55bb2cdf0..9e8c82ec3d8 100644 --- a/python/cugraph/link_prediction/jaccard.pxd +++ b/python/cugraph/link_prediction/jaccard.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -19,7 +19,7 @@ from cugraph.structure.graph_primtypes cimport * -cdef extern from "algorithms.hpp" namespace "cugraph": +cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": cdef void jaccard[VT,ET,WT]( const GraphCSRView[VT,ET,WT] &graph, diff --git a/python/cugraph/link_prediction/jaccard.py b/python/cugraph/link_prediction/jaccard.py index 71cf0925342..e69308ac595 100644 --- a/python/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/link_prediction/jaccard.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,11 +13,11 @@ import pandas as pd import cudf -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph from cugraph.link_prediction import jaccard_wrapper -from cugraph.structure.graph import null_check from cugraph.utilities import check_nx_graph from cugraph.utilities import df_edge_score_to_dictionary +from cugraph.utilities import renumber_vertex_pair def jaccard(input_graph, vertex_pair=None): @@ -109,15 +109,8 @@ def jaccard(input_graph, vertex_pair=None): if type(input_graph) is not Graph: raise Exception("input graph must be undirected") - # FIXME: Add support for multi-column vertices if type(vertex_pair) == cudf.DataFrame: - for col in vertex_pair.columns: - null_check(vertex_pair[col]) - if input_graph.renumbered: - vertex_pair = input_graph.add_internal_vertex_id( - vertex_pair, col, col - ) - + vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) elif vertex_pair is None: pass else: diff --git a/python/cugraph/link_prediction/jaccard_wrapper.pyx b/python/cugraph/link_prediction/jaccard_wrapper.pyx index cacd13dec65..8d236c60ee2 100644 --- a/python/cugraph/link_prediction/jaccard_wrapper.pyx +++ b/python/cugraph/link_prediction/jaccard_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. 
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -21,8 +21,6 @@ from cugraph.link_prediction.jaccard cimport jaccard_list as c_jaccard_list from cugraph.structure.graph_primtypes cimport * from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t -from cython cimport floating - import cudf import numpy as np diff --git a/python/cugraph/link_prediction/overlap.pxd b/python/cugraph/link_prediction/overlap.pxd index 970032b56eb..f0654472587 100644 --- a/python/cugraph/link_prediction/overlap.pxd +++ b/python/cugraph/link_prediction/overlap.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -19,7 +19,7 @@ from cugraph.structure.graph_primtypes cimport * -cdef extern from "algorithms.hpp" namespace "cugraph": +cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": cdef void overlap[VT,ET,WT]( const GraphCSRView[VT,ET,WT] &graph, diff --git a/python/cugraph/link_prediction/overlap.py b/python/cugraph/link_prediction/overlap.py index a5ca1e22979..4650f24f181 100644 --- a/python/cugraph/link_prediction/overlap.py +++ b/python/cugraph/link_prediction/overlap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,10 +13,10 @@ import pandas as pd from cugraph.link_prediction import overlap_wrapper -from cugraph.structure.graph import null_check import cudf from cugraph.utilities import check_nx_graph from cugraph.utilities import df_edge_score_to_dictionary +from cugraph.utilities import renumber_vertex_pair def overlap_coefficient(G, ebunch=None): @@ -91,14 +91,8 @@ def overlap(input_graph, vertex_pair=None): >>> df = cugraph.overlap(G) """ - # FIXME: Add support for multi-column vertices if type(vertex_pair) == cudf.DataFrame: - for col in vertex_pair.columns: - null_check(vertex_pair[col]) - if input_graph.renumbered: - vertex_pair = input_graph.add_internal_vertex_id( - vertex_pair, col, col, - ) + vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) elif vertex_pair is None: pass else: diff --git a/python/cugraph/link_prediction/overlap_wrapper.pyx b/python/cugraph/link_prediction/overlap_wrapper.pyx index 9e2f3ba49d7..ec0274716fb 100644 --- a/python/cugraph/link_prediction/overlap_wrapper.pyx +++ b/python/cugraph/link_prediction/overlap_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -21,8 +21,6 @@ from cugraph.link_prediction.overlap cimport overlap_list as c_overlap_list from cugraph.structure.graph_primtypes cimport * from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t -from cython cimport floating - import cudf import numpy as np @@ -70,8 +68,9 @@ def overlap(input_graph, weights_arr=None, vertex_pair=None): df = cudf.DataFrame() df['overlap_coeff'] = result - first = vertex_pair['first'] - second = vertex_pair['second'] + cols = vertex_pair.columns.to_list() + first = vertex_pair[cols[0]] + second = vertex_pair[cols[1]] # FIXME: multi column support df['source'] = first diff --git a/python/cugraph/link_prediction/wjaccard.py b/python/cugraph/link_prediction/wjaccard.py index 2a4e2417102..9616bfd49a8 100644 --- a/python/cugraph/link_prediction/wjaccard.py +++ b/python/cugraph/link_prediction/wjaccard.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,10 +11,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph from cugraph.link_prediction import jaccard_wrapper -from cugraph.structure.graph import null_check import cudf +import numpy as np +from cugraph.utilities import renumber_vertex_pair def jaccard_w(input_graph, weights, vertex_pair=None): @@ -36,8 +37,15 @@ def jaccard_w(input_graph, weights, vertex_pair=None): as an edge list (edge weights are not used for this algorithm). The adjacency list will be computed if not already present. - weights : cudf.Series + weights : cudf.DataFrame Specifies the weights to be used for each vertex. + Vertex should be represented by multiple columns for multi-column + vertices. 
+ + weights['vertex'] : cudf.Series + Contains the vertex identifiers + weights['weight'] : cudf.Series + Contains the weights of vertices vertex_pair : cudf.DataFrame A GPU dataframe consisting of two columns representing pairs of @@ -71,20 +79,28 @@ def jaccard_w(input_graph, weights, vertex_pair=None): if type(input_graph) is not Graph: raise Exception("input graph must be undirected") - # FIXME: Add support for multi-column vertices if type(vertex_pair) == cudf.DataFrame: - for col in vertex_pair.columns: - null_check(vertex_pair[col]) - if input_graph.renumbered: - vertex_pair = input_graph.add_internal_vertex_id( - vertex_pair, col, col, - ) + vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) elif vertex_pair is None: pass else: raise ValueError("vertex_pair must be a cudf dataframe") - df = jaccard_wrapper.jaccard(input_graph, weights, vertex_pair) + if input_graph.renumbered: + vertex_size = input_graph.vertex_column_size() + if vertex_size == 1: + weights = input_graph.add_internal_vertex_id( + weights, 'vertex', 'vertex' + ) + else: + cols = weights.columns[:vertex_size].to_list() + weights = input_graph.add_internal_vertex_id( + weights, 'vertex', cols + ) + jaccard_weights = cudf.Series(np.ones(len(weights))) + for i in range(len(weights)): + jaccard_weights[weights['vertex'].iloc[i]] = weights['weight'].iloc[i] + df = jaccard_wrapper.jaccard(input_graph, jaccard_weights, vertex_pair) if input_graph.renumbered: df = input_graph.unrenumber(df, "source") diff --git a/python/cugraph/link_prediction/woverlap.py b/python/cugraph/link_prediction/woverlap.py index c93ad28ea54..920d3e3f80d 100644 --- a/python/cugraph/link_prediction/woverlap.py +++ b/python/cugraph/link_prediction/woverlap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,8 +12,9 @@ # limitations under the License. 
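The jaccard_w changes above remap the user's weights DataFrame to internal vertex ids and then scatter each weight into a dense series, position i holding the weight of internal vertex i, via an explicit per-row loop. A sketch of the same alignment done vectorized on host (hypothetical data; the PR itself keeps the loop):

```python
import cudf
import numpy as np

# hypothetical: three vertices with internal ids 0..2
weights = cudf.DataFrame({"vertex": [2, 0, 1],
                          "weight": [0.5, 2.0, 1.5]})

# scatter: position i receives the weight of internal vertex i
dense = np.ones(len(weights), dtype="float32")
dense[weights["vertex"].values_host] = weights["weight"].values_host
jaccard_weights = cudf.Series(dense)  # [2.0, 1.5, 0.5]
```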
from cugraph.link_prediction import overlap_wrapper -from cugraph.structure.graph import null_check import cudf +import numpy as np +from cugraph.utilities import renumber_vertex_pair def overlap_w(input_graph, weights, vertex_pair=None): @@ -67,20 +68,33 @@ def overlap_w(input_graph, weights, vertex_pair=None): >>> G.from_cudf_edgelist(M, source='0', destination='1') >>> df = cugraph.overlap_w(G, M[2]) """ - # FIXME: Add support for multi-column vertices + if type(vertex_pair) == cudf.DataFrame: - for col in vertex_pair.columns: - null_check(vertex_pair[col]) - if input_graph.renumbered: - vertex_pair = input_graph.add_internal_vertex_id( - vertex_pair, col, col - ) + vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) elif vertex_pair is None: pass else: raise ValueError("vertex_pair must be a cudf dataframe") - df = overlap_wrapper.overlap(input_graph, weights, vertex_pair) + if input_graph.renumbered: + vertex_size = input_graph.vertex_column_size() + if vertex_size == 1: + weights = input_graph.add_internal_vertex_id( + weights, 'vertex', 'vertex' + ) + else: + cols = weights.columns[:vertex_size].to_list() + weights = input_graph.add_internal_vertex_id( + weights, 'vertex', cols + ) + + overlap_weights = cudf.Series(np.ones(len(weights))) + for i in range(len(weights)): + overlap_weights[weights['vertex'].iloc[i]] = weights['weight'].iloc[i] + + overlap_weights = overlap_weights.astype('float32') + + df = overlap_wrapper.overlap(input_graph, overlap_weights, vertex_pair) if input_graph.renumbered: df = input_graph.unrenumber(df, "source") diff --git a/python/cugraph/bsp/traversal/__init__.py b/python/cugraph/sampling/__init__.py similarity index 83% rename from python/cugraph/bsp/traversal/__init__.py rename to python/cugraph/sampling/__init__.py index 061d1d7e3a1..ab0bfab0c66 100644 --- a/python/cugraph/bsp/traversal/__init__.py +++ b/python/cugraph/sampling/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,5 +11,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.bsp.traversal.bfs_bsp import bfs_df_pregel - +from cugraph.sampling.random_walks import random_walks, rw_path diff --git a/python/cugraph/sampling/random_walks.pxd b/python/cugraph/sampling/random_walks.pxd new file mode 100644 index 00000000000..f86d6396c98 --- /dev/null +++ b/python/cugraph/sampling/random_walks.pxd @@ -0,0 +1,28 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
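overlap_w above mirrors jaccard_w: renumber the vertex pairs, remap the per-vertex weights to internal ids, then call the wrapper with a float32 series. A minimal usage sketch (hypothetical toy data; the 'vertex' and 'weight' column names follow the new docstring convention):

```python
import cudf
import cugraph

edges = cudf.DataFrame({"src": [0, 1, 2], "dst": [1, 2, 0],
                        "wt": [1.0, 1.0, 1.0]})
G = cugraph.Graph()
G.from_cudf_edgelist(edges, source="src", destination="dst", edge_attr="wt")

# one weight per vertex, keyed by the external vertex id
vertex_weights = cudf.DataFrame({"vertex": [0, 1, 2],
                                 "weight": [0.3, 0.5, 0.2]})
df = cugraph.overlap_w(G, vertex_weights)  # source, destination, overlap_coeff
```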
+#from cugraph.structure.graph_primtypes cimport *
+from cugraph.structure.graph_utilities cimport *
+
+cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython":
+ cdef unique_ptr[random_walk_ret_t] call_random_walks[vertex_t, edge_t](
+ const handle_t &handle,
+ const graph_container_t &g,
+ const vertex_t *ptr_d_start,
+ edge_t num_paths,
+ edge_t max_depth,
+ bool use_padding) except +
+
+ cdef unique_ptr[random_walk_path_t] call_rw_paths[index_t](
+ const handle_t &handle,
+ index_t num_paths,
+ const index_t* sizes) except +
diff --git a/python/cugraph/sampling/random_walks.py b/python/cugraph/sampling/random_walks.py
new file mode 100644
index 00000000000..fc21abd3bc4
--- /dev/null
+++ b/python/cugraph/sampling/random_walks.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cudf
+from cugraph.sampling import random_walks_wrapper
+import cugraph
+
+
+def random_walks(G,
+ start_vertices,
+ max_depth=None,
+ use_padding=False):
+ """
+ Compute random walks for each node in 'start_vertices'.
+
+ Parameters
+ ----------
+ G : cuGraph.Graph or networkx.Graph
+ The graph can be either directed (DiGraph) or undirected (Graph).
+ Weights in the graph are ignored.
+ Use the weight parameter if weights need to be considered
+ (currently not supported).
+
+ start_vertices : int or list or cudf.Series or cudf.DataFrame
+ A single node or a list or a cudf.Series of nodes from which to run
+ the random walks. In the case of multi-column vertices it should be
+ a cudf.DataFrame.
+
+ max_depth : int
+ The maximum depth of the random walks.
+
+ use_padding : bool
+ If True, padded paths are returned; otherwise coalesced paths are
+ returned.
+
+ Returns
+ -------
+ vertex_paths : cudf.Series or cudf.DataFrame
+ Series containing the vertices of edges/paths in the random walk.
+
+ edge_weight_paths: cudf.Series
+ Series containing the edge weights of edges represented by the
+ returned vertex_paths.
+
+ sizes: int
+ The path size in the case of coalesced paths.
+ """ + if max_depth is None: + raise TypeError("must specify a 'max_depth'") + + G, _ = cugraph.utilities.check_nx_graph(G) + + if start_vertices is int: + start_vertices = [start_vertices] + + if isinstance(start_vertices, list): + start_vertices = cudf.Series(start_vertices) + + if G.renumbered is True: + if isinstance(start_vertices, cudf.DataFrame): + start_vertices = G.lookup_internal_vertex_id( + start_vertices, + start_vertices.columns) + else: + start_vertices = G.lookup_internal_vertex_id(start_vertices) + + vertex_set, edge_set, sizes = random_walks_wrapper.random_walks( + G, start_vertices, max_depth, use_padding) + + if G.renumbered: + df_ = cudf.DataFrame() + df_['vertex_set'] = vertex_set + df_ = G.unrenumber(df_, 'vertex_set', preserve_order=True) + vertex_set = cudf.Series(df_['vertex_set']) + + if use_padding: + edge_set_sz = (max_depth-1)*len(start_vertices) + return vertex_set, edge_set[:edge_set_sz], sizes + + vertex_set_sz = sizes.sum() + edge_set_sz = vertex_set_sz - len(start_vertices) + return vertex_set[:vertex_set_sz], edge_set[:edge_set_sz], sizes + + +def rw_path(num_paths, sizes): + """ + Retrieve more information on the obtained paths in case use_padding + is False. + + parameters + ---------- + num_paths: int + Number of paths in the random walk output. + + sizes: int + Path size returned in random walk output. + + Returns + ------- + path_data : cudf.DataFrame + Dataframe containing vetex path offsets, edge weight offsets and + edge weight sizes for each path. + """ + return random_walks_wrapper.rw_path_retrieval(num_paths, sizes) diff --git a/python/cugraph/sampling/random_walks_wrapper.pyx b/python/cugraph/sampling/random_walks_wrapper.pyx new file mode 100644 index 00000000000..688ece9595b --- /dev/null +++ b/python/cugraph/sampling/random_walks_wrapper.pyx @@ -0,0 +1,162 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from cugraph.sampling.random_walks cimport call_random_walks, call_rw_paths +#from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * +from libcpp cimport bool +from libcpp.utility cimport move +from libc.stdint cimport uintptr_t +from cugraph.structure import graph_primtypes_wrapper +import cudf +import rmm +import numpy as np +import numpy.ctypeslib as ctypeslib +from rmm._lib.device_buffer cimport DeviceBuffer +from cudf.core.buffer import Buffer +from cython.operator cimport dereference as deref + + +def random_walks(input_graph, start_vertices, max_depth, use_padding): + """ + Call random_walks + """ + # FIXME: Offsets and indices are currently hardcoded to int, but this may + # not be acceptable in the future. 
+ numberTypeMap = {np.dtype("int32") : numberTypeEnum.int32Type, + np.dtype("int64") : numberTypeEnum.int64Type, + np.dtype("float32") : numberTypeEnum.floatType, + np.dtype("double") : numberTypeEnum.doubleType} + [src, dst] = [input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']] + vertex_t = src.dtype + edge_t = np.dtype("int32") + weights = None + if input_graph.edgelist.weights: + weights = input_graph.edgelist.edgelist_df['weights'] + num_verts = input_graph.number_of_vertices() + num_edges = input_graph.number_of_edges(directed_edges=True) + num_partition_edges = num_edges + + if num_edges > (2**31 - 1): + edge_t = np.dtype("int64") + cdef unique_ptr[random_walk_ret_t] rw_ret_ptr + + cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] + cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] + cdef uintptr_t c_edge_weights = NULL + if weights is not None: + c_edge_weights = weights.__cuda_array_interface__['data'][0] + weight_t = weights.dtype + is_weighted = True + else: + weight_t = np.dtype("float32") + is_weighted = False + + is_symmetric = not input_graph.is_directed() + + # Pointers for random_walks + start_vertices = start_vertices.astype('int32') + cdef uintptr_t c_start_vertex_ptr = start_vertices.__cuda_array_interface__['data'][0] + num_paths = start_vertices.size + cdef unique_ptr[handle_t] handle_ptr + handle_ptr.reset(new handle_t()) + handle_ = handle_ptr.get() + cdef graph_container_t graph_container + populate_graph_container(graph_container, + handle_[0], + c_src_vertices, c_dst_vertices, c_edge_weights, + NULL, + NULL, + 0, + ((numberTypeMap[vertex_t])), + ((numberTypeMap[edge_t])), + ((numberTypeMap[weight_t])), + num_partition_edges, + num_verts, + num_edges, + is_weighted, + is_symmetric, + False, False) + if(vertex_t == np.dtype("int32")): + if(edge_t == np.dtype("int32")): + rw_ret_ptr = move(call_random_walks[int, int]( deref(handle_), + graph_container, + c_start_vertex_ptr, + num_paths, + max_depth, + use_padding)) + else: # (edge_t == np.dtype("int64")): + rw_ret_ptr = move(call_random_walks[int, long]( deref(handle_), + graph_container, + c_start_vertex_ptr, + num_paths, + max_depth, + use_padding)) + else: # (vertex_t == edge_t == np.dtype("int64")): + rw_ret_ptr = move(call_random_walks[long, long]( deref(handle_), + graph_container, + c_start_vertex_ptr, + num_paths, + max_depth, + use_padding)) + + + rw_ret= move(rw_ret_ptr.get()[0]) + vertex_set = DeviceBuffer.c_from_unique_ptr(move(rw_ret.d_coalesced_v_)) + edge_set = DeviceBuffer.c_from_unique_ptr(move(rw_ret.d_coalesced_w_)) + vertex_set = Buffer(vertex_set) + edge_set = Buffer(edge_set) + + set_vertex = cudf.Series(data=vertex_set, dtype=vertex_t) + set_edge = cudf.Series(data=edge_set, dtype=weight_t) + + if not use_padding: + sizes = DeviceBuffer.c_from_unique_ptr(move(rw_ret.d_sizes_)) + sizes = Buffer(sizes) + set_sizes = cudf.Series(data=sizes, dtype=edge_t) + else: + set_sizes = None + + return set_vertex, set_edge, set_sizes + + +def rw_path_retrieval(num_paths, sizes): + cdef unique_ptr[handle_t] handle_ptr + handle_ptr.reset(new handle_t()) + handle_ = handle_ptr.get() + index_t = sizes.dtype + + cdef unique_ptr[random_walk_path_t] rw_path_ptr + cdef uintptr_t c_sizes = sizes.__cuda_array_interface__['data'][0] + + if index_t == np.dtype("int32"): + rw_path_ptr = move(call_rw_paths[int](deref(handle_), + num_paths, + c_sizes)) + else: # index_t == np.dtype("int64"): + rw_path_ptr = move(call_rw_paths[long](deref(handle_), + 
num_paths, + c_sizes)) + + rw_path = move(rw_path_ptr.get()[0]) + vertex_offsets = DeviceBuffer.c_from_unique_ptr(move(rw_path.d_v_offsets)) + weight_sizes = DeviceBuffer.c_from_unique_ptr(move(rw_path.d_w_sizes)) + weight_offsets = DeviceBuffer.c_from_unique_ptr(move(rw_path.d_w_offsets)) + vertex_offsets = Buffer(vertex_offsets) + weight_sizes = Buffer(weight_sizes) + weight_offsets = Buffer(weight_offsets) + + df = cudf.DataFrame() + df['vertex_offsets'] = cudf.Series(data=vertex_offsets, dtype=index_t) + df['weight_sizes'] = cudf.Series(data=weight_sizes, dtype=index_t) + df['weight_offsets'] = cudf.Series(data=weight_offsets, dtype=index_t) + return df diff --git a/python/cugraph/structure/__init__.py b/python/cugraph/structure/__init__.py index 34447e80ee9..b70854d61ce 100644 --- a/python/cugraph/structure/__init__.py +++ b/python/cugraph/structure/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.structure.graph import Graph, DiGraph +from cugraph.structure.graph_classes import (Graph, + DiGraph, + MultiGraph, + MultiDiGraph, + BiPartiteGraph, + BiPartiteDiGraph) +from cugraph.structure.graph_classes import (is_weighted, + is_directed, + is_multigraph, + is_bipartite, + is_multipartite) from cugraph.structure.number_map import NumberMap from cugraph.structure.symmetrize import symmetrize, symmetrize_df , symmetrize_ddf from cugraph.structure.convert_matrix import (from_edgelist, diff --git a/python/cugraph/structure/convert_matrix.py b/python/cugraph/structure/convert_matrix.py index edd1c630185..5b3c375ea9d 100644 --- a/python/cugraph/structure/convert_matrix.py +++ b/python/cugraph/structure/convert_matrix.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,7 +18,7 @@ import cudf import dask_cudf -from cugraph.structure.graph import DiGraph, Graph +from cugraph.structure.graph_classes import DiGraph, Graph # optional dependencies used for handling different input types try: diff --git a/python/cugraph/structure/graph.py b/python/cugraph/structure/graph.py deleted file mode 100644 index 53c3a4e656c..00000000000 --- a/python/cugraph/structure/graph.py +++ /dev/null @@ -1,1527 +0,0 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
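The structure/__init__.py hunk above swaps the monolithic graph.py (deleted below) for the graph_classes hierarchy and a set of free predicates. A short sketch of the reorganized surface (hypothetical toy data; the helpers are assumed to accept a graph object, matching their use elsewhere in cugraph):

```python
import cudf
from cugraph.structure import Graph, is_directed, is_weighted

edges = cudf.DataFrame({"src": [0, 1], "dst": [1, 2], "wt": [1.0, 2.0]})

G = Graph()
G.from_cudf_edgelist(edges, source="src", destination="dst", edge_attr="wt")

print(is_directed(G))  # False: Graph symmetrizes its edge list
print(is_weighted(G))  # True: an edge attribute was supplied
```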
- -from cugraph.structure import graph_primtypes_wrapper -from cugraph.structure.symmetrize import symmetrize -from cugraph.structure.number_map import NumberMap -from cugraph.dask.common.input_utils import get_local_data -import cugraph.dask.common.mg_utils as mg_utils -import cudf -import dask_cudf -import cugraph.comms.comms as Comms -import pandas as pd -import numpy as np -from cugraph.dask.structure import replication - - -def null_check(col): - if col.null_count != 0: - raise ValueError("Series contains NULL values") - - -class Graph: - class EdgeList: - def __init__(self, *args): - if len(args) == 1: - self.__from_dask_cudf(*args) - else: - self.__from_cudf(*args) - - def __from_cudf(self, source, destination, edge_attr=None): - self.edgelist_df = cudf.DataFrame() - self.edgelist_df["src"] = source - self.edgelist_df["dst"] = destination - self.weights = False - if edge_attr is not None: - self.weights = True - if type(edge_attr) is dict: - for k in edge_attr.keys(): - self.edgelist_df[k] = edge_attr[k] - else: - self.edgelist_df["weights"] = edge_attr - - def __from_dask_cudf(self, ddf): - self.edgelist_df = ddf - self.weights = False - # FIXME: Edge Attribute not handled - - class AdjList: - def __init__(self, offsets, indices, value=None): - self.offsets = offsets - self.indices = indices - self.weights = value # Should be a dataframe for multiple weights - - class transposedAdjList: - def __init__(self, offsets, indices, value=None): - Graph.AdjList.__init__(self, offsets, indices, value) - - """ - cuGraph graph class containing basic graph creation and transformation - operations. - """ - - def __init__( - self, - m_graph=None, - edge_attr=None, - symmetrized=False, - bipartite=False, - multi=False, - dynamic=False, - ): - """ - Returns - ------- - G : cuGraph.Graph. - - Examples - -------- - >>> import cuGraph - >>> G = cuGraph.Graph() - - """ - self.symmetrized = symmetrized - self.renumbered = False - self.renumber_map = None - self.bipartite = False - self.multipartite = False - self._nodes = {} - self.multi = multi - self.distributed = False - self.dynamic = dynamic - self.self_loop = False - self.edgelist = None - self.adjlist = None - self.transposedadjlist = None - self.edge_count = None - self.node_count = None - - # MG - Batch - self.batch_enabled = False - self.batch_edgelists = None - self.batch_adjlists = None - self.batch_transposed_adjlists = None - - if m_graph is not None: - if (type(self) is Graph and type(m_graph) is MultiGraph) or ( - type(self) is DiGraph and type(m_graph) is MultiDiGraph - ): - self.from_cudf_edgelist( - m_graph.edgelist.edgelist_df, - source="src", - destination="dst", - edge_attr=edge_attr, - ) - self.renumbered = m_graph.renumbered - self.renumber_map = m_graph.renumber_map - else: - msg = ( - "Graph can be initialized using MultiGraph " - "and DiGraph can be initialized using MultiDiGraph" - ) - raise Exception(msg) - # self.number_of_vertices = None - - def enable_batch(self): - client = mg_utils.get_client() - comms = Comms.get_comms() - - if client is None or comms is None: - msg = ( - "MG Batch needs a Dask Client and the " - "Communicator needs to be initialized." 
- ) - raise Exception(msg) - - self.batch_enabled = True - - if self.edgelist is not None: - if self.batch_edgelists is None: - self._replicate_edgelist() - - if self.adjlist is not None: - if self.batch_adjlists is None: - self._replicate_adjlist() - - if self.transposedadjlist is not None: - if self.batch_transposed_adjlists is None: - self._replicate_transposed_adjlist() - - def _replicate_edgelist(self): - client = mg_utils.get_client() - comms = Comms.get_comms() - - # FIXME: There might be a better way to control it - if client is None: - return - work_futures = replication.replicate_cudf_dataframe( - self.edgelist.edgelist_df, client=client, comms=comms - ) - - self.batch_edgelists = work_futures - - def _replicate_adjlist(self): - client = mg_utils.get_client() - comms = Comms.get_comms() - - # FIXME: There might be a better way to control it - if client is None: - return - - weights = None - offsets_futures = replication.replicate_cudf_series( - self.adjlist.offsets, client=client, comms=comms - ) - indices_futures = replication.replicate_cudf_series( - self.adjlist.indices, client=client, comms=comms - ) - - if self.adjlist.weights is not None: - weights = replication.replicate_cudf_series(self.adjlist.weights) - else: - weights = {worker: None for worker in offsets_futures} - - merged_futures = { - worker: [ - offsets_futures[worker], - indices_futures[worker], - weights[worker], - ] - for worker in offsets_futures - } - self.batch_adjlists = merged_futures - - # FIXME: Not implemented yet - def _replicate_transposed_adjlist(self): - self.batch_transposed_adjlists = True - - def clear(self): - """ - Empty this graph. This function is added for NetworkX compatibility. - """ - self.edgelist = None - self.adjlist = None - self.transposedadjlist = None - - self.batch_edgelists = None - self.batch_adjlists = None - self.batch_transposed_adjlists = None - - def add_nodes_from(self, nodes, bipartite=None, multipartite=None): - """ - Add nodes information to the Graph. - - Parameters - ---------- - nodes : list or cudf.Series - The nodes of the graph to be stored. If bipartite and multipartite - arguments are not passed, the nodes are considered to be a list of - all the nodes present in the Graph. - bipartite : str - Sets the Graph as bipartite. The nodes are stored as a set of nodes - of the partition named as bipartite argument. - multipartite : str - Sets the Graph as multipartite. The nodes are stored as a set of - nodes of the partition named as multipartite argument. - """ - if bipartite is None and multipartite is None: - self._nodes["all_nodes"] = cudf.Series(nodes) - else: - set_names = [i for i in self._nodes.keys() if i != "all_nodes"] - if multipartite is not None: - if self.bipartite: - raise Exception( - "The Graph is already set as bipartite. " - "Use bipartite option instead." - ) - self.multipartite = True - elif bipartite is not None: - if self.multipartite: - raise Exception( - "The Graph is set as multipartite. " - "Use multipartite option instead." - ) - self.bipartite = True - multipartite = bipartite - if multipartite not in set_names and len(set_names) == 2: - raise Exception( - "The Graph is set as bipartite and " - "already has two partitions initialized." - ) - self._nodes[multipartite] = cudf.Series(nodes) - - def is_bipartite(self): - """ - Checks if Graph is bipartite. This solely relies on the user call of - add_nodes_from with the bipartite parameter. This does not parse the - graph to check if it is bipartite. 
- """ - # TO DO: Call coloring algorithm - return self.bipartite - - def is_multipartite(self): - """ - Checks if Graph is multipartite. This solely relies on the user call - of add_nodes_from with the partition parameter. This does not parse - the graph to check if it is multipartite. - """ - # TO DO: Call coloring algorithm - return self.multipartite or self.bipartite - - def sets(self): - """ - Returns the bipartite set of nodes. This solely relies on the user's - call of add_nodes_from with the bipartite parameter. This does not - parse the graph to compute bipartite sets. If bipartite argument was - not provided during add_nodes_from(), it raise an exception that the - graph is not bipartite. - """ - # TO DO: Call coloring algorithm - set_names = [i for i in self._nodes.keys() if i != "all_nodes"] - if self.bipartite: - top = self._nodes[set_names[0]] - if len(set_names) == 2: - bottom = self._nodes[set_names[1]] - else: - bottom = cudf.Series( - set(self.nodes().values_host) - set(top.values_host) - ) - return top, bottom - else: - return {k: self._nodes[k] for k in set_names} - - def from_cudf_edgelist( - self, - input_df, - source="source", - destination="destination", - edge_attr=None, - renumber=True, - ): - """ - Initialize a graph from the edge list. It is an error to call this - method on an initialized Graph object. The passed input_df argument - wraps gdf_column objects that represent a graph using the edge list - format. source argument is source column name and destination argument - is destination column name. - - By default, renumbering is enabled to map the source and destination - vertices into an index in the range [0, V) where V is the number - of vertices. If the input vertices are a single column of integers - in the range [0, V), renumbering can be disabled and the original - external vertex ids will be used. - - If weights are present, edge_attr argument is the weights column name. - - Parameters - ---------- - input_df : cudf.DataFrame or dask_cudf.DataFrame - A DataFrame that contains edge information - If a dask_cudf.DataFrame is passed it will be reinterpreted as - a cudf.DataFrame. For the distributed path please use - from_dask_cudf_edgelist. - source : str or array-like - source column name or array of column names - destination : str or array-like - destination column name or array of column names - edge_attr : str or None - the weights column name. Default is None - renumber : bool - Indicate whether or not to renumber the source and destination - vertex IDs. Default is True. - - Examples - -------- - >>> df = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(df, source='0', destination='1', - edge_attr='2', renumber=False) - - """ - if self.edgelist is not None or self.adjlist is not None: - raise Exception("Graph already has values") - - s_col = source - d_col = destination - if not isinstance(s_col, list): - s_col = [s_col] - if not isinstance(d_col, list): - d_col = [d_col] - if not ( - set(s_col).issubset(set(input_df.columns)) - and set(d_col).issubset(set(input_df.columns)) - ): - raise Exception( - "source column names and/or destination column " - "names not found in input. 
Recheck the source and " - "destination parameters" - ) - - # FIXME: update for smaller GPUs - # Consolidation - if isinstance(input_df, cudf.DataFrame): - if len(input_df[source]) > 2147483100: - raise Exception( - "cudf dataFrame edge list is too big " - "to fit in a single GPU" - ) - elist = input_df - elif isinstance(input_df, dask_cudf.DataFrame): - if len(input_df[source]) > 2147483100: - raise Exception( - "dask_cudf dataFrame edge list is too big " - "to fit in a single GPU" - ) - elist = input_df.compute().reset_index(drop=True) - else: - raise Exception( - "input should be a cudf.DataFrame or " - "a dask_cudf dataFrame" - ) - - renumber_map = None - if renumber: - # FIXME: Should SG do lazy evaluation like MG? - elist, renumber_map = NumberMap.renumber( - elist, source, destination, store_transposed=False - ) - source = "src" - destination = "dst" - self.renumbered = True - self.renumber_map = renumber_map - else: - if type(source) is list and type(destination) is list: - raise Exception("set renumber to True for multi column ids") - - if (elist[source] == elist[destination]).any(): - self.self_loop = True - source_col = elist[source] - dest_col = elist[destination] - - if self.multi: - if type(edge_attr) is not list: - raise Exception("edge_attr should be a list of column names") - value_col = {} - for col_name in edge_attr: - value_col[col_name] = elist[col_name] - elif edge_attr is not None: - value_col = elist[edge_attr] - else: - value_col = None - - if not self.symmetrized and not self.multi: - if value_col is not None: - source_col, dest_col, value_col = symmetrize( - source_col, dest_col, value_col - ) - else: - source_col, dest_col = symmetrize(source_col, dest_col) - - self.edgelist = Graph.EdgeList(source_col, dest_col, value_col) - - if self.batch_enabled: - self._replicate_edgelist() - - self.renumber_map = renumber_map - - def from_pandas_edgelist( - self, - pdf, - source="source", - destination="destination", - edge_attr=None, - renumber=True, - ): - """ - Initialize a graph from the edge list. It is an error to call this - method on an initialized Graph object. Source argument is source - column name and destination argument is destination column name. - - By default, renumbering is enabled to map the source and destination - vertices into an index in the range [0, V) where V is the number - of vertices. If the input vertices are a single column of integers - in the range [0, V), renumbering can be disabled and the original - external vertex ids will be used. - - If weights are present, edge_attr argument is the weights column name. - - Parameters - ---------- - input_df : pandas.DataFrame - A DataFrame that contains edge information - source : str or array-like - source column name or array of column names - destination : str or array-like - destination column name or array of column names - edge_attr : str or None - the weights column name. Default is None - renumber : bool - Indicate whether or not to renumber the source and destination - vertex IDs. Default is True. 
- - Examples - -------- - >>> df = pandas.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> G = cugraph.Graph() - >>> G.from_pandas_edgelist(df, source='0', destination='1', - edge_attr='2', renumber=False) - - """ - gdf = cudf.DataFrame.from_pandas(pdf) - self.from_cudf_edgelist(gdf, source=source, destination=destination, - edge_attr=edge_attr, renumber=renumber) - - def to_pandas_edgelist(self, source='source', destination='destination'): - """ - Returns the graph edge list as a Pandas DataFrame. - - Parameters - ---------- - source : str or array-like - source column name or array of column names - destination : str or array-like - destination column name or array of column names - - Returns - ------- - df : pandas.DataFrame - """ - - gdf = self.view_edge_list() - return gdf.to_pandas() - - def from_pandas_adjacency(self, pdf): - """ - Initializes the graph from pandas adjacency matrix - """ - np_array = pdf.to_numpy() - columns = pdf.columns - self.from_numpy_array(np_array, columns) - - def to_pandas_adjacency(self): - """ - Returns the graph adjacency matrix as a Pandas DataFrame. - """ - - np_array_data = self.to_numpy_array() - pdf = pd.DataFrame(np_array_data) - if self.renumbered: - nodes = self.renumber_map.implementation.df['0'].\ - values_host.tolist() - pdf.columns = nodes - pdf.index = nodes - return pdf - - def to_numpy_array(self): - """ - Returns the graph adjacency matrix as a NumPy array. - """ - - nlen = self.number_of_nodes() - elen = self.number_of_edges() - df = self.edgelist.edgelist_df - np_array = np.full((nlen, nlen), 0.0) - for i in range(0, elen): - np_array[df['src'].iloc[i], df['dst'].iloc[i]] = df['weights'].\ - iloc[i] - return np_array - - def to_numpy_matrix(self): - """ - Returns the graph adjacency matrix as a NumPy matrix. - """ - np_array = self.to_numpy_array() - return np.asmatrix(np_array) - - def from_numpy_array(self, np_array, nodes=None): - """ - Initializes the graph from numpy array containing adjacency matrix. - """ - src, dst = np_array.nonzero() - weight = np_array[src, dst] - df = cudf.DataFrame() - if nodes is not None: - df['src'] = nodes[src] - df['dst'] = nodes[dst] - else: - df['src'] = src - df['dst'] = dst - df['weight'] = weight - self.from_cudf_edgelist(df, 'src', 'dst', edge_attr='weight') - - def from_numpy_matrix(self, np_matrix): - """ - Initializes the graph from numpy matrix containing adjacency matrix. - """ - np_array = np.asarray(np_matrix) - self.from_numpy_array(np_array) - - def from_dask_cudf_edgelist( - self, - input_ddf, - source="source", - destination="destination", - edge_attr=None, - renumber=True, - ): - """ - Initializes the distributed graph from the dask_cudf.DataFrame - edgelist. Undirected Graphs are not currently supported. - - By default, renumbering is enabled to map the source and destination - vertices into an index in the range [0, V) where V is the number - of vertices. If the input vertices are a single column of integers - in the range [0, V), renumbering can be disabled and the original - external vertex ids will be used. - - Note that the graph object will store a reference to the - dask_cudf.DataFrame provided. - - Parameters - ---------- - input_ddf : dask_cudf.DataFrame - The edgelist as a dask_cudf.DataFrame - source : str or array-like - source column name or array of column names - destination : str - destination column name or array of column names - edge_attr : str - weights column name. 
- renumber : bool
- If source and destination indices are not in range 0 to V where V
- is the number of vertices, the renumber argument should be True.
- """
- if self.edgelist is not None or self.adjlist is not None:
- raise Exception("Graph already has values")
- if not isinstance(input_ddf, dask_cudf.DataFrame):
- raise Exception("input should be a dask_cudf dataFrame")
- if type(self) is Graph:
- raise Exception("Undirected distributed graph not supported")
-
- s_col = source
- d_col = destination
- if not isinstance(s_col, list):
- s_col = [s_col]
- if not isinstance(d_col, list):
- d_col = [d_col]
- if not (
- set(s_col).issubset(set(input_ddf.columns))
- and set(d_col).issubset(set(input_ddf.columns))
- ):
- raise Exception(
- "source column names and/or destination column "
- "names not found in input. Recheck the source "
- "and destination parameters"
- )
- ddf_columns = s_col + d_col
- if edge_attr is not None:
- if not (set([edge_attr]).issubset(set(input_ddf.columns))):
- raise Exception(
- "edge_attr column name not found in input."
- "Recheck the edge_attr parameter")
- ddf_columns = ddf_columns + [edge_attr]
- input_ddf = input_ddf[ddf_columns]
-
- if edge_attr is not None:
- input_ddf = input_ddf.rename(columns={edge_attr: 'value'})
-
- #
- # Keep all of the original parameters so we can lazily
- # evaluate this function
- #
-
- # FIXME: Edge Attribute not handled
- self.distributed = True
- self.local_data = None
- self.edgelist = None
- self.adjlist = None
- self.renumbered = renumber
- self.input_df = input_ddf
- self.source_columns = source
- self.destination_columns = destination
- self.store_tranposed = None
-
- def compute_local_data(self, by, load_balance=True):
- """
- Compute the local edges, vertices and offsets for a distributed
- graph stored as a dask-cudf dataframe and initialize the
- communicator. Performs global sorting and load_balancing.
-
- Parameters
- ----------
- by : str
- by argument is the column by which we want to sort and
- partition. It should be the source column name for generating
- CSR format and destination column name for generating CSC
- format.
- load_balance : bool
- Set as True to perform load_balancing after global sorting of
- dask-cudf DataFrame. This ensures that the data is uniformly
- distributed among multiple GPUs to avoid over-loading.
- """
- if self.distributed:
- data = get_local_data(self, by, load_balance)
- self.local_data = {}
- self.local_data["data"] = data
- self.local_data["by"] = by
- else:
- raise Exception("Graph should be a distributed graph")
-
- def view_edge_list(self):
- """
- Display the edge list. Compute it if needed.
-
- NOTE: If the graph is of type Graph() then the displayed undirected
- edges are the same as displayed by networkx Graph(), but the direction
- could be different i.e. an edge displayed by cugraph as (src, dst)
- could be displayed as (dst, src) by networkx.
-
- cugraph.Graph stores a symmetrized edgelist internally. For displaying
- the undirected edgelist for a Graph, the upper triangular matrix of the
- symmetrized edgelist is returned.
-
- networkx.Graph renumbers the input and stores the upper triangle of
- this renumbered input. Since the internal renumbering of networkx and
- cugraph is different, the upper triangular matrix of networkx
- renumbered input may not be the same as cugraph's upper triangular
- matrix of the symmetrized edgelist. Hence the displayed source and
- destination pairs in both will represent the same edge but node values
- could be swapped.
-
- Returns
- -------
- df : cudf.DataFrame
- This cudf.DataFrame wraps source, destination and weight
-
- df[src] : cudf.Series
- contains the source index for each edge
- df[dst] : cudf.Series
- contains the destination index for each edge
- df[weight] : cudf.Series
- Column is only present for a weighted Graph, and contains
- the weight value for each edge
- """
- if self.distributed:
- if self.edgelist is None:
- raise Exception("Graph has no Edgelist.")
- return self.edgelist.edgelist_df
- if self.edgelist is None:
- src, dst, weights = graph_primtypes_wrapper.view_edge_list(self)
- self.edgelist = self.EdgeList(src, dst, weights)
-
- edgelist_df = self.edgelist.edgelist_df
-
- if self.renumbered:
- edgelist_df = self.unrenumber(edgelist_df, "src")
- edgelist_df = self.unrenumber(edgelist_df, "dst")
-
- if type(self) is Graph:
- edgelist_df = edgelist_df[edgelist_df["src"] <= edgelist_df["dst"]]
- edgelist_df = edgelist_df.reset_index(drop=True)
- self.edge_count = len(edgelist_df)
-
- return edgelist_df
-
- def delete_edge_list(self):
- """
- Delete the edge list.
- """
- # decrease reference count to free memory if the referenced objects are
- # no longer used.
- self.edgelist = None
-
- def from_cudf_adjlist(self, offset_col, index_col, value_col=None):
- """
- Initialize a graph from the adjacency list. It is an error to call this
- method on an initialized Graph object. The passed offset_col and
- index_col arguments wrap gdf_column objects that represent a graph
- using the adjacency list format.
- If value_col is None, an unweighted graph is created. If value_col is
- not None, a weighted graph is created.
- Undirected edges must be stored as directed edges in both directions.
-
- Parameters
- ----------
- offset_col : cudf.Series
- This cudf.Series wraps a gdf_column of size V + 1 (V: number of
- vertices).
- The gdf column contains the offsets for the vertices in this graph.
- Offsets must be in the range [0, E] (E: number of edges).
- index_col : cudf.Series
- This cudf.Series wraps a gdf_column of size E (E: number of edges).
- The gdf column contains the destination index for each edge.
- Destination indices must be in the range [0, V) (V: number of
- vertices).
- value_col : cudf.Series, optional
- This pointer can be ``None``.
- If not, this cudf.Series wraps a gdf_column of size E (E: number of
- edges).
- The gdf column contains the weight value for each edge.
- The expected type of the gdf_column element is floating point
- number.
-
- Examples
- --------
- >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
- >>> dtype=['int32', 'int32', 'float32'], header=None)
- >>> M = gdf.to_pandas()
- >>> M = scipy.sparse.coo_matrix((M['2'],(M['0'],M['1'])))
- >>> M = M.tocsr()
- >>> offsets = cudf.Series(M.indptr)
- >>> indices = cudf.Series(M.indices)
- >>> G = cugraph.Graph()
- >>> G.from_cudf_adjlist(offsets, indices, None)
-
- """
- if self.edgelist is not None or self.adjlist is not None:
- raise Exception("Graph already has values")
- self.adjlist = Graph.AdjList(offset_col, index_col, value_col)
-
- if self.batch_enabled:
- self._replicate_adjlist()
-
- def compute_renumber_edge_list(self, transposed=False):
- """
- Compute a renumbered edge list
-
- This function works in the MNMG pipeline and will transform
- the input dask_cudf.DataFrame into a renumbered edge list
- in the prescribed direction.
-
- This function will be called by the algorithms to ensure
- that the graph is renumbered properly.
The graph object will - cache the most recent renumbering attempt. For benchmarking - purposes, this function can be called prior to calling a - graph algorithm so we can measure the cost of computing - the renumbering separately from the cost of executing the - algorithm. - - When creating a CSR-like structure, set transposed to False. - When creating a CSC-like structure, set transposed to True. - - Parameters - ---------- - transposed : (optional) bool - If True, renumber with the intent to make a CSC-like - structure. If False, renumber with the intent to make - a CSR-like structure. Defaults to False. - """ - # FIXME: What to do about edge_attr??? - # currently ignored for MNMG - - if not self.distributed: - raise Exception( - "compute_renumber_edge_list should only be used " - "for distributed graphs" - ) - - if not self.renumbered: - self.edgelist = self.EdgeList(self.input_df) - self.renumber_map = None - else: - if self.edgelist is not None: - if type(self) is Graph: - return - - if self.store_transposed == transposed: - return - - del self.edgelist - - renumbered_ddf, number_map = NumberMap.renumber( - self.input_df, - self.source_columns, - self.destination_columns, - store_transposed=transposed, - ) - self.edgelist = self.EdgeList(renumbered_ddf) - self.renumber_map = number_map - self.store_transposed = transposed - - def view_adj_list(self): - """ - Display the adjacency list. Compute it if needed. - - Returns - ------- - offset_col : cudf.Series - This cudf.Series wraps a gdf_column of size V + 1 (V: number of - vertices). - The gdf column contains the offsets for the vertices in this graph. - Offsets are in the range [0, E] (E: number of edges). - index_col : cudf.Series - This cudf.Series wraps a gdf_column of size E (E: number of edges). - The gdf column contains the destination index for each edge. - Destination indices are in the range [0, V) (V: number of - vertices). - value_col : cudf.Series or ``None`` - This pointer is ``None`` for unweighted graphs. - For weighted graphs, this cudf.Series wraps a gdf_column of size E - (E: number of edges). - The gdf column contains the weight value for each edge. - The expected type of the gdf_column element is floating point - number. - """ - if self.distributed: - raise Exception("Not supported for distributed graph") - - if self.adjlist is None: - if self.transposedadjlist is not None and type(self) is Graph: - off, ind, vals = ( - self.transposedadjlist.offsets, - self.transposedadjlist.indices, - self.transposedadjlist.weights, - ) - else: - off, ind, vals = graph_primtypes_wrapper.view_adj_list(self) - self.adjlist = self.AdjList(off, ind, vals) - - if self.batch_enabled: - self._replicate_adjlist() - - return self.adjlist.offsets, self.adjlist.indices, self.adjlist.weights - - def view_transposed_adj_list(self): - """ - Display the transposed adjacency list. Compute it if needed. - - Returns - ------- - offset_col : cudf.Series - This cudf.Series wraps a gdf_column of size V + 1 (V: number of - vertices). - The gdf column contains the offsets for the vertices in this graph. - Offsets are in the range [0, E] (E: number of edges). - index_col : cudf.Series - This cudf.Series wraps a gdf_column of size E (E: number of edges). - The gdf column contains the destination index for each edge. - Destination indices are in the range [0, V) (V: number of - vertices). - value_col : cudf.Series or ``None`` - This pointer is ``None`` for unweighted graphs. 
- For weighted graphs, this cudf.Series wraps a gdf_column of size E - (E: number of edges). - The gdf column contains the weight value for each edge. - The expected type of the gdf_column element is floating point - number. - - """ - if self.distributed: - raise Exception("Not supported for distributed graph") - if self.transposedadjlist is None: - if self.adjlist is not None and type(self) is Graph: - off, ind, vals = ( - self.adjlist.offsets, - self.adjlist.indices, - self.adjlist.weights, - ) - else: - ( - off, - ind, - vals, - ) = graph_primtypes_wrapper.view_transposed_adj_list(self) - self.transposedadjlist = self.transposedAdjList(off, ind, vals) - - if self.batch_enabled: - self._replicate_transposed_adjlist() - - return ( - self.transposedadjlist.offsets, - self.transposedadjlist.indices, - self.transposedadjlist.weights, - ) - - def delete_adj_list(self): - """ - Delete the adjacency list. - """ - self.adjlist = None - - def get_two_hop_neighbors(self): - """ - Compute vertex pairs that are two hops apart. The resulting pairs are - sorted before returning. - - Returns - ------- - df : cudf.DataFrame - df[first] : cudf.Series - the first vertex id of a pair, if an external vertex id - is defined by only one column - df[second] : cudf.Series - the second vertex id of a pair, if an external vertex id - is defined by only one column - """ - if self.distributed: - raise Exception("Not supported for distributed graph") - df = graph_primtypes_wrapper.get_two_hop_neighbors(self) - if self.renumbered is True: - df = self.unrenumber(df, "first") - df = self.unrenumber(df, "second") - - return df - - def number_of_vertices(self): - """ - Get the number of nodes in the graph. - - """ - if self.node_count is None: - if self.distributed: - if self.edgelist is not None: - ddf = self.edgelist.edgelist_df[["src", "dst"]] - self.node_count = ddf.max().max().compute() + 1 - else: - raise Exception("Graph is Empty") - elif self.adjlist is not None: - self.node_count = len(self.adjlist.offsets) - 1 - elif self.transposedadjlist is not None: - self.node_count = len(self.transposedadjlist.offsets) - 1 - elif self.edgelist is not None: - df = self.edgelist.edgelist_df[["src", "dst"]] - self.node_count = df.max().max() + 1 - else: - raise Exception("Graph is Empty") - return self.node_count - - def number_of_nodes(self): - """ - An alias of number_of_vertices(). This function is added for NetworkX - compatibility. - - """ - return self.number_of_vertices() - - def number_of_edges(self, directed_edges=False): - """ - Get the number of edges in the graph. - - """ - if self.distributed: - if self.edgelist is not None: - return len(self.edgelist.edgelist_df) - else: - raise ValueError("Graph is Empty") - if directed_edges and self.edgelist is not None: - return len(self.edgelist.edgelist_df) - if self.edge_count is None: - if self.edgelist is not None: - if type(self) is Graph: - self.edge_count = len( - self.edgelist.edgelist_df[ - self.edgelist.edgelist_df["src"] - >= self.edgelist.edgelist_df["dst"] - ] - ) - else: - self.edge_count = len(self.edgelist.edgelist_df) - elif self.adjlist is not None: - self.edge_count = len(self.adjlist.indices) - elif self.transposedadjlist is not None: - self.edge_count = len(self.transposedadjlist.indices) - else: - raise ValueError("Graph is Empty") - return self.edge_count - - def in_degree(self, vertex_subset=None): - """ - Compute vertex in-degree. Vertex in-degree is the number of edges - pointing into the vertex. 
By default, this method computes vertex - degrees for the entire set of vertices. If vertex_subset is provided, - this method optionally filters out all but those listed in - vertex_subset. - - Parameters - ---------- - vertex_subset : cudf.Series or iterable container, optional - A container of vertices for displaying corresponding in-degree. - If not set, degrees are computed for the entire set of vertices. - - Returns - ------- - df : cudf.DataFrame - GPU DataFrame of size N (the default) or the size of the given - vertices (vertex_subset) containing the in_degree. The ordering is - relative to the adjacency list, or that given by the specified - vertex_subset. - - df[vertex] : cudf.Series - The vertex IDs (will be identical to vertex_subset if - specified). - df[degree] : cudf.Series - The computed in-degree of the corresponding vertex. - - Examples - -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, '0', '1') - >>> df = G.in_degree([0,9,12]) - - """ - return self._degree(vertex_subset, x=1) - - def out_degree(self, vertex_subset=None): - """ - Compute vertex out-degree. Vertex out-degree is the number of edges - pointing out from the vertex. By default, this method computes vertex - degrees for the entire set of vertices. If vertex_subset is provided, - this method optionally filters out all but those listed in - vertex_subset. - - Parameters - ---------- - vertex_subset : cudf.Series or iterable container, optional - A container of vertices for displaying corresponding out-degree. - If not set, degrees are computed for the entire set of vertices. - - Returns - ------- - df : cudf.DataFrame - GPU DataFrame of size N (the default) or the size of the given - vertices (vertex_subset) containing the out_degree. The ordering is - relative to the adjacency list, or that given by the specified - vertex_subset. - - df[vertex] : cudf.Series - The vertex IDs (will be identical to vertex_subset if - specified). - df[degree] : cudf.Series - The computed out-degree of the corresponding vertex. - - Examples - -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, '0', '1') - >>> df = G.out_degree([0,9,12]) - - """ - if self.distributed: - raise Exception("Not supported for distributed graph") - return self._degree(vertex_subset, x=2) - - def degree(self, vertex_subset=None): - """ - Compute vertex degree, which is the total number of edges incident - to a vertex (both in and out edges). By default, this method computes - degrees for the entire set of vertices. If vertex_subset is provided, - then this method optionally filters out all but those listed in - vertex_subset. - - Parameters - ---------- - vertex_subset : cudf.Series or iterable container, optional - a container of vertices for displaying corresponding degree. If not - set, degrees are computed for the entire set of vertices. - - Returns - ------- - df : cudf.DataFrame - GPU DataFrame of size N (the default) or the size of the given - vertices (vertex_subset) containing the degree. The ordering is - relative to the adjacency list, or that given by the specified - vertex_subset. - - df['vertex'] : cudf.Series - The vertex IDs (will be identical to vertex_subset if - specified). - df['degree'] : cudf.Series - The computed degree of the corresponding vertex. 
- - Examples - -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, '0', '1') - >>> all_df = G.degree() - >>> subset_df = G.degree([0,9,12]) - - """ - if self.distributed: - raise Exception("Not supported for distributed graph") - return self._degree(vertex_subset) - - # FIXME: vertex_subset could be a DataFrame for multi-column vertices - def degrees(self, vertex_subset=None): - """ - Compute vertex in-degree and out-degree. By default, this method - computes vertex degrees for the entire set of vertices. If - vertex_subset is provided, this method optionally filters out all but - those listed in vertex_subset. - - Parameters - ---------- - vertex_subset : cudf.Series or iterable container, optional - A container of vertices for displaying corresponding degree. If not - set, degrees are computed for the entire set of vertices. - - Returns - ------- - df : cudf.DataFrame - GPU DataFrame of size N (the default) or the size of the given - vertices (vertex_subset) containing the degrees. The ordering is - relative to the adjacency list, or that given by the specified - vertex_subset. - - df['vertex'] : cudf.Series - The vertex IDs (will be identical to vertex_subset if - specified). - df['in_degree'] : cudf.Series - The in-degree of the vertex. - df['out_degree'] : cudf.Series - The out-degree of the vertex. - - Examples - -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, '0', '1') - >>> df = G.degrees([0,9,12]) - - """ - if self.distributed: - raise Exception("Not supported for distributed graph") - ( - vertex_col, - in_degree_col, - out_degree_col, - ) = graph_primtypes_wrapper._degrees(self) - - df = cudf.DataFrame() - df["vertex"] = vertex_col - df["in_degree"] = in_degree_col - df["out_degree"] = out_degree_col - - if self.renumbered is True: - df = self.unrenumber(df, "vertex") - - if vertex_subset is not None: - df = df[df['vertex'].isin(vertex_subset)] - - return df - - def _degree(self, vertex_subset, x=0): - vertex_col, degree_col = graph_primtypes_wrapper._degree(self, x) - df = cudf.DataFrame() - df["vertex"] = vertex_col - df["degree"] = degree_col - - if self.renumbered is True: - df = self.unrenumber(df, "vertex") - - if vertex_subset is not None: - df = df[df['vertex'].isin(vertex_subset)] - - return df - - def to_directed(self): - """ - Return a directed representation of the graph. - This function sets the type of graph as DiGraph() and returns the - directed view. - - Returns - ------- - G : DiGraph - A directed graph with the same nodes, and each edge (u,v,weights) - replaced by two directed edges (u,v,weights) and (v,u,weights). - - Examples - -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, '0', '1') - >>> DiG = G.to_directed() - - """ - if self.distributed: - raise Exception("Not supported for distributed graph") - if type(self) is DiGraph: - return self - if type(self) is Graph: - DiG = DiGraph() - DiG.renumbered = self.renumbered - DiG.renumber_map = self.renumber_map - DiG.edgelist = self.edgelist - DiG.adjlist = self.adjlist - DiG.transposedadjlist = self.transposedadjlist - return DiG - - def to_undirected(self): - """ - Return an undirected copy of the graph. 
- - Returns - ------- - G : Graph - A undirected graph with the same nodes, and each directed edge - (u,v,weights) replaced by an undirected edge (u,v,weights). - - Examples - -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> DiG = cugraph.DiGraph() - >>> DiG.from_cudf_edgelist(M, '0', '1') - >>> G = DiG.to_undirected() - - """ - - if type(self) is Graph: - return self - if type(self) is DiGraph: - G = Graph() - df = self.edgelist.edgelist_df - G.renumbered = self.renumbered - G.renumber_map = self.renumber_map - G.multi = self.multi - if self.edgelist.weights: - source_col, dest_col, value_col = symmetrize( - df["src"], df["dst"], df["weights"] - ) - else: - source_col, dest_col = symmetrize(df["src"], df["dst"]) - value_col = None - G.edgelist = Graph.EdgeList(source_col, dest_col, value_col) - - return G - - def is_directed(self): - if type(self) is DiGraph: - return True - else: - return False - - def has_node(self, n): - """ - Returns True if the graph contains the node n. - """ - if self.edgelist is None: - raise Exception("Graph has no Edgelist.") - if self.distributed: - ddf = self.edgelist.edgelist_df[["src", "dst"]] - return (ddf == n).any().any().compute() - if self.renumbered: - tmp = self.renumber_map.to_internal_vertex_id(cudf.Series([n])) - return tmp[0] is not cudf.NA and tmp[0] >= 0 - else: - df = self.edgelist.edgelist_df[["src", "dst"]] - return (df == n).any().any() - - def has_edge(self, u, v): - """ - Returns True if the graph contains the edge (u,v). - """ - if self.edgelist is None: - raise Exception("Graph has no Edgelist.") - if self.renumbered: - tmp = cudf.DataFrame({"src": [u, v]}) - tmp = tmp.astype({"src": "int"}) - tmp = self.add_internal_vertex_id( - tmp, "id", "src", preserve_order=True - ) - - u = tmp["id"][0] - v = tmp["id"][1] - - df = self.edgelist.edgelist_df - if self.distributed: - return ((df["src"] == u) & (df["dst"] == v)).any().compute() - return ((df["src"] == u) & (df["dst"] == v)).any() - - def edges(self): - """ - Returns all the edges in the graph as a cudf.DataFrame containing - sources and destinations. It does not return the edge weights. 
- For viewing edges with weights use view_edge_list() - """ - return self.view_edge_list()[["src", "dst"]] - - def nodes(self): - """ - Returns all the nodes in the graph as a cudf.Series - """ - if self.distributed: - raise Exception("Not supported for distributed graph") - if self.edgelist is not None: - df = self.edgelist.edgelist_df - if self.renumbered: - # FIXME: If vertices are multicolumn - # this needs to return a dataframe - # FIXME: This relies on current implementation - # of NumberMap, should not really expose - # this, perhaps add a method to NumberMap - return self.renumber_map.implementation.df["0"] - else: - return cudf.concat([df["src"], df["dst"]]).unique() - if "all_nodes" in self._nodes.keys(): - return self._nodes["all_nodes"] - else: - n = cudf.Series(dtype="int") - set_names = [i for i in self._nodes.keys() if i != "all_nodes"] - for k in set_names: - n = n.append(self._nodes[k]) - return n - - def neighbors(self, n): - if self.edgelist is None: - raise Exception("Graph has no Edgelist.") - if self.distributed: - ddf = self.edgelist.edgelist_df - return ddf[ddf["src"] == n]["dst"].reset_index(drop=True) - if self.renumbered: - node = self.renumber_map.to_internal_vertex_id(cudf.Series([n])) - if len(node) == 0: - return cudf.Series(dtype="int") - n = node[0] - - df = self.edgelist.edgelist_df - neighbors = df[df["src"] == n]["dst"].reset_index(drop=True) - if self.renumbered: - # FIXME: Multi-column vertices - return self.renumber_map.from_internal_vertex_id(neighbors)["0"] - else: - return neighbors - - def unrenumber(self, df, column_name, preserve_order=False): - """ - Given a DataFrame containing internal vertex ids in the identified - column, replace this with external vertex ids. If the renumbering - is from a single column, the output dataframe will use the same - name for the external vertex identifiers. If the renumbering is from - a multi-column input, the output columns will be labeled 0 through - n-1 with a suffix of _column_name. - - Note that this function does not guarantee order in single GPU mode, - and does not guarantee order or partitioning in multi-GPU mode. If you - wish to preserve ordering, add an index column to df and sort the - return by that index column. - - Parameters - ---------- - df: cudf.DataFrame or dask_cudf.DataFrame - A DataFrame containing internal vertex identifiers that will be - converted into external vertex identifiers. - - column_name: string - Name of the column containing the internal vertex id. - - preserve_order: (optional) bool - If True, preserve the order of the rows in the output - DataFrame to match the input DataFrame - - Returns - --------- - df : cudf.DataFrame or dask_cudf.DataFrame - The original DataFrame columns exist unmodified. The external - vertex identifiers are added to the DataFrame, the internal - vertex identifier column is removed from the dataframe. - """ - return self.renumber_map.unrenumber(df, column_name, preserve_order) - - def lookup_internal_vertex_id(self, df, column_name=None): - """ - Given a DataFrame containing external vertex ids in the identified - columns, or a Series containing external vertex ids, return a - Series with the internal vertex ids. - - Note that this function does not guarantee order in single GPU mode, - and does not guarantee order or partitioning in multi-GPU mode. 
- - Parameters - ---------- - df: cudf.DataFrame, cudf.Series, dask_cudf.DataFrame, dask_cudf.Series - A DataFrame containing external vertex identifiers that will be - converted into internal vertex identifiers. - - column_name: (optional) string - Name of the column containing the external vertex ids - - Returns - --------- - series : cudf.Series or dask_cudf.Series - The internal vertex identifiers - """ - return self.renumber_map.to_internal_vertex_id(df, column_name) - - def add_internal_vertex_id( - self, - df, - internal_column_name, - external_column_name, - drop=True, - preserve_order=False, - ): - """ - Given a DataFrame containing external vertex ids in the identified - columns, return a DataFrame containing the internal vertex ids as the - specified column name. Optionally drop the external vertex id columns. - Optionally preserve the order of the original DataFrame. - - Parameters - ---------- - df: cudf.DataFrame or dask_cudf.DataFrame - A DataFrame containing external vertex identifiers that will be - converted into internal vertex identifiers. - - internal_column_name: string - Name of column to contain the internal vertex id - - external_column_name: string or list of strings - Name of the column(s) containing the external vertex ids - - drop: (optional) bool, defaults to True - Drop the external columns from the returned DataFrame - - preserve_order: (optional) bool, defaults to False - Preserve the order of the data frame (requires an extra sort) - - Returns - --------- - df : cudf.DataFrame or dask_cudf.DataFrame - Original DataFrame with new column containing internal vertex - id - """ - return self.renumber_map.add_internal_vertex_id( - df, - internal_column_name, - external_column_name, - drop, - preserve_order, - ) - - -class DiGraph(Graph): - def __init__(self, m_graph=None, edge_attr=None): - super().__init__( - m_graph=m_graph, edge_attr=edge_attr, symmetrized=True - ) - - -class MultiGraph(Graph): - def __init__(self, renumbered=True): - super().__init__(multi=True) - - -class MultiDiGraph(Graph): - def __init__(self, renumbered=True): - super().__init__(symmetrized=True, multi=True) diff --git a/python/cugraph/structure/graph_classes.py b/python/cugraph/structure/graph_classes.py new file mode 100644 index 00000000000..0fc8b454138 --- /dev/null +++ b/python/cugraph/structure/graph_classes.py @@ -0,0 +1,790 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
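+
+# A minimal usage sketch for the classes defined in this module (an
+# illustrative example only; it assumes the 'datasets/karate.csv' file
+# referenced throughout the docstrings below):
+#
+#   import cudf
+#   import cugraph
+#
+#   gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
+#                       dtype=['int32', 'int32', 'float32'], header=None)
+#   G = cugraph.Graph(directed=True)
+#   G.from_cudf_edgelist(gdf, source='0', destination='1', edge_attr='2')
+#   print(G.number_of_vertices(), G.number_of_edges())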
+ +import numpy as np +from .graph_implementation import (simpleGraphImpl, + simpleDistributedGraphImpl, + npartiteGraphImpl) +import cudf +import warnings + + +# TODO: Move to utilities +def null_check(col): + if col.null_count != 0: + raise ValueError("Series contains NULL values") + + +class Graph: + class Properties: + def __init__(self, directed): + self.directed = directed + self.weights = False + + def __init__(self, m_graph=None, directed=False): + self._Impl = None + self.graph_properties = Graph.Properties(directed) + if m_graph is not None: + if m_graph.is_multigraph(): + elist = m_graph.view_edge_list() + if m_graph.is_weighted(): + weights = "weights" + else: + weights = None + self.from_cudf_edgelist(elist, + source="src", + destination="dst", + edge_attr=weights) + else: + msg = ( + "Graph can only be initialized using MultiGraph " + "or MultiDiGraph" + ) + raise Exception(msg) + + def __getattr__(self, name): + if self._Impl is None: + raise AttributeError(name) + if hasattr(self._Impl, name): + return getattr(self._Impl, name) + # FIXME: Remove access to Impl properties + elif hasattr(self._Impl.properties, name): + return getattr(self._Impl.properties, name) + else: + raise AttributeError(name) + + def __dir__(self): + return dir(self._Impl) + + def from_cudf_edgelist( + self, + input_df, + source="source", + destination="destination", + edge_attr=None, + renumber=True + ): + """ + Initialize a graph from the edge list. It is an error to call this + method on an initialized Graph object. The passed input_df argument + wraps gdf_column objects that represent a graph using the edge list + format. source argument is source column name and destination argument + is destination column name. + By default, renumbering is enabled to map the source and destination + vertices into an index in the range [0, V) where V is the number + of vertices. If the input vertices are a single column of integers + in the range [0, V), renumbering can be disabled and the original + external vertex ids will be used. + If weights are present, edge_attr argument is the weights column name. + + Parameters + ---------- + input_df : cudf.DataFrame or dask_cudf.DataFrame + A DataFrame that contains edge information If a dask_cudf.DataFrame is + passed it will be reinterpreted as a cudf.DataFrame. For the + distributed path please use from_dask_cudf_edgelist. + + source : str or array-like + source column name or array of column names + + destination : str or array-like + destination column name or array of column names + + edge_attr : str or None + the weights column name. Default is None + + renumber : bool + Indicate whether or not to renumber the source and destination vertex + IDs. Default is True. 
+ + Examples + -------- + >>> df = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(df, source='0', destination='1', + edge_attr='2', renumber=False) + """ + if self._Impl is None: + self._Impl = simpleGraphImpl(self.graph_properties) + elif type(self._Impl) is not simpleGraphImpl: + raise Exception("Graph is already initialized") + elif (self._Impl.edgelist is not None or + self._Impl.adjlist is not None): + raise Exception("Graph already has values") + self._Impl._simpleGraphImpl__from_edgelist(input_df, + source=source, + destination=destination, + edge_attr=edge_attr, + renumber=renumber) + + def from_cudf_adjlist(self, offset_col, index_col, value_col=None): + """ + Initialize a graph from the adjacency list. It is an error to call this + method on an initialized Graph object. The passed offset_col and + index_col arguments wrap gdf_column objects that represent a graph + using the adjacency list format. + If value_col is None, an unweighted graph is created. If value_col is + not None, a weighted graph is created. + Undirected edges must be stored as directed edges in both directions. + Parameters + ---------- + offset_col : cudf.Series + This cudf.Series wraps a gdf_column of size V + 1 (V: number of + vertices). The gdf column contains the offsets for the vertices in + this graph. Offsets must be in the range [0, E] (E: number of edges). + + index_col : cudf.Series + This cudf.Series wraps a gdf_column of size E (E: number of edges). + The gdf column contains the destination index for each edge. + Destination indices must be in the range [0, V) + (V: number of vertices). + + value_col : cudf.Series, optional + This pointer can be ``None``. If not, this cudf.Series wraps a + gdf_column of size E (E: number of edges). The gdf column contains the + weight value for each edge. The expected type of the gdf_column + element is floating point number. + + Examples + -------- + >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> M = gdf.to_pandas() + >>> M = scipy.sparse.coo_matrix((M['2'],(M['0'],M['1']))) + >>> M = M.tocsr() + >>> offsets = cudf.Series(M.indptr) + >>> indices = cudf.Series(M.indices) + >>> G = cugraph.Graph() + >>> G.from_cudf_adjlist(offsets, indices, None) + """ + if self._Impl is None: + self._Impl = simpleGraphImpl(self.graph_properties) + elif type(self._Impl) is not simpleGraphImpl: + raise Exception("Graph is already initialized") + elif (self._Impl.edgelist is not None or + self._Impl.adjlist is not None): + raise Exception("Graph already has values") + self._Impl._simpleGraphImpl__from_adjlist(offset_col, + index_col, + value_col) + + def from_dask_cudf_edgelist( + self, + input_ddf, + source="source", + destination="destination", + edge_attr=None, + renumber=True, + ): + """ + Initializes the distributed graph from the dask_cudf.DataFrame + edgelist. Undirected Graphs are not currently supported. + By default, renumbering is enabled to map the source and destination + vertices into an index in the range [0, V) where V is the number + of vertices. If the input vertices are a single column of integers + in the range [0, V), renumbering can be disabled and the original + external vertex ids will be used. + Note that the graph object will store a reference to the + dask_cudf.DataFrame provided. 
+ + Parameters + ---------- + input_ddf : dask_cudf.DataFrame + The edgelist as a dask_cudf.DataFrame + + source : str or array-like + source column name or array of column names + + destination : str + destination column name or array of column names + + edge_attr : str + weights column name. + + renumber : bool + If source and destination indices are not in range 0 to V where V is + number of vertices, renumber argument should be True. + """ + if self._Impl is None: + self._Impl = simpleDistributedGraphImpl(self.graph_properties) + elif type(self._Impl) is not simpleDistributedGraphImpl: + raise Exception("Graph is already initialized") + elif (self._Impl.edgelist is not None): + raise Exception("Graph already has values") + self._Impl._simpleDistributedGraphImpl__from_edgelist(input_ddf, + source, + destination, + edge_attr, + renumber) + + # Move to Compat Module + def from_pandas_edgelist( + self, + pdf, + source="source", + destination="destination", + edge_attr=None, + renumber=True, + ): + """ + Initialize a graph from the edge list. It is an error to call this + method on an initialized Graph object. Source argument is source + column name and destination argument is destination column name. + By default, renumbering is enabled to map the source and destination + vertices into an index in the range [0, V) where V is the number + of vertices. If the input vertices are a single column of integers + in the range [0, V), renumbering can be disabled and the original + external vertex ids will be used. + If weights are present, edge_attr argument is the weights column name. + + Parameters + ---------- + input_df : pandas.DataFrame + A DataFrame that contains edge information + + source : str or array-like + source column name or array of column names + + destination : str or array-like + destination column name or array of column names + + edge_attr : str or None + the weights column name. Default is None + + renumber : bool + Indicate whether or not to renumber the source and destination vertex + IDs. Default is True. + + Examples + -------- + >>> df = pandas.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_pandas_edgelist(df, source='0', destination='1', + edge_attr='2', renumber=False) + """ + gdf = cudf.DataFrame.from_pandas(pdf) + self.from_cudf_edgelist(gdf, source=source, destination=destination, + edge_attr=edge_attr, renumber=renumber) + + def from_pandas_adjacency(self, pdf): + """ + Initializes the graph from pandas adjacency matrix + """ + np_array = pdf.to_numpy() + columns = pdf.columns + self.from_numpy_array(np_array, columns) + + def from_numpy_array(self, np_array, nodes=None): + """ + Initializes the graph from numpy array containing adjacency matrix. + """ + src, dst = np_array.nonzero() + weight = np_array[src, dst] + df = cudf.DataFrame() + if nodes is not None: + df['src'] = nodes[src] + df['dst'] = nodes[dst] + else: + df['src'] = src + df['dst'] = dst + df['weight'] = weight + self.from_cudf_edgelist(df, 'src', 'dst', edge_attr='weight') + + def from_numpy_matrix(self, np_matrix): + """ + Initializes the graph from numpy matrix containing adjacency matrix. + """ + np_array = np.asarray(np_matrix) + self.from_numpy_array(np_array) + + def unrenumber(self, df, column_name, preserve_order=False, + get_column_names=False): + """ + Given a DataFrame containing internal vertex ids in the identified + column, replace this with external vertex ids. 
If the renumbering + is from a single column, the output dataframe will use the same + name for the external vertex identifiers. If the renumbering is from + a multi-column input, the output columns will be labeled 0 through + n-1 with a suffix of _column_name. + Note that this function does not guarantee order in single GPU mode, + and does not guarantee order or partitioning in multi-GPU mode. If you + wish to preserve ordering, add an index column to df and sort the + return by that index column. + + Parameters + ---------- + df: cudf.DataFrame or dask_cudf.DataFrame + A DataFrame containing internal vertex identifiers that will be + converted into external vertex identifiers. + + column_name: string + Name of the column containing the internal vertex id. + + preserve_order: (optional) bool + If True, preserve the order of the rows in the output DataFrame to + match the input DataFrame + + Returns + --------- + df : cudf.DataFrame or dask_cudf.DataFrame + The original DataFrame columns exist unmodified. The external vertex + identifiers are added to the DataFrame, the internal vertex identifier + column is removed from the dataframe. + """ + return self.renumber_map.unrenumber(df, column_name, preserve_order, + get_column_names) + + def lookup_internal_vertex_id(self, df, column_name=None): + """ + Given a DataFrame containing external vertex ids in the identified + columns, or a Series containing external vertex ids, return a + Series with the internal vertex ids. + Note that this function does not guarantee order in single GPU mode, + and does not guarantee order or partitioning in multi-GPU mode. + + Parameters + ---------- + df: cudf.DataFrame, cudf.Series, dask_cudf.DataFrame, dask_cudf.Series + A DataFrame containing external vertex identifiers that will be + converted into internal vertex identifiers. + + column_name: (optional) string + Name of the column containing the external vertex ids + + Returns + --------- + series : cudf.Series or dask_cudf.Series + The internal vertex identifiers + """ + return self.renumber_map.to_internal_vertex_id(df, column_name) + + def add_internal_vertex_id( + self, + df, + internal_column_name, + external_column_name, + drop=True, + preserve_order=False, + ): + """ + Given a DataFrame containing external vertex ids in the identified + columns, return a DataFrame containing the internal vertex ids as the + specified column name. Optionally drop the external vertex id columns. + Optionally preserve the order of the original DataFrame. + Parameters + ---------- + df: cudf.DataFrame or dask_cudf.DataFrame + A DataFrame containing external vertex identifiers that will be + converted into internal vertex identifiers. + + internal_column_name: string + Name of column to contain the internal vertex id + + external_column_name: string or list of strings + Name of the column(s) containing the external vertex ids + + drop: (optional) bool, defaults to True + Drop the external columns from the returned DataFrame + + preserve_order: (optional) bool, defaults to False + Preserve the order of the data frame (requires an extra sort) + + Returns + --------- + df : cudf.DataFrame or dask_cudf.DataFrame + Original DataFrame with new column containing internal vertex + id + + """ + return self.renumber_map.add_internal_vertex_id( + df, + internal_column_name, + external_column_name, + drop, + preserve_order, + ) + + def clear(self): + """ + Empty the graph. + """ + self._Impl = None + + def is_bipartite(self): + """ + Checks if Graph is bipartite. 
This solely relies on the user call of + add_nodes_from with the bipartite parameter. This does not parse the + graph to check if it is bipartite. + """ + # TO DO: Call coloring algorithm + return False + + def is_multipartite(self): + """ + Checks if Graph is multipartite. This solely relies on the user call + of add_nodes_from with the partition parameter. This does not parse + the graph to check if it is multipartite. + """ + # TO DO: Call coloring algorithm + return False + + def is_multigraph(self): + """ + Returns True if the graph is a multigraph. Else returns False. + """ + # TO DO: Call coloring algorithm + return False + + def is_directed(self): + """ + Returns True if the graph is a directed graph. + Returns False if the graph is an undirected graph. + """ + return self.graph_properties.directed + + def is_renumbered(self): + """ + Returns True if the graph is renumbered. + """ + return self.properties.renumbered + + def is_weighted(self): + """ + Returns True if the graph has edge weights. + """ + return self.properties.weighted + + def has_isolated_vertices(self): + """ + Returns True if the graph has isolated vertices. + """ + return self.properties.isolated_vertices + + def to_directed(self): + """ + Return a directed representation of the graph. + This function sets the type of graph as DiGraph() and returns the + directed view. + + Returns + ------- + G : DiGraph + A directed graph with the same nodes, and each edge (u,v,weights) + replaced by two directed edges (u,v,weights) and (v,u,weights). + + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> DiG = G.to_directed() + """ + + directed_graph = type(self)() + directed_graph.graph_properties.directed = True + directed_graph._Impl = type(self._Impl)(directed_graph. + graph_properties) + self._Impl.to_directed(directed_graph._Impl) + return directed_graph + + def to_undirected(self): + """ + Return an undirected copy of the graph. + + Returns + ------- + G : Graph + A undirected graph with the same nodes, and each directed edge + (u,v,weights) replaced by an undirected edge (u,v,weights). + + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> DiG = cugraph.DiGraph() + >>> DiG.from_cudf_edgelist(M, '0', '1') + >>> G = DiG.to_undirected() + """ + + if self.graph_properties.directed is False: + undirected_graph = type(self)() + elif self.__class__.__bases__[0] == object: + undirected_graph = type(self)() + else: + undirected_graph = self.__class__.__bases__[0]() + undirected_graph._Impl = type(self._Impl)(undirected_graph. + graph_properties) + self._Impl.to_undirected(undirected_graph._Impl) + return undirected_graph + + def add_nodes_from(self, nodes): + """ + Add nodes information to the Graph. + Parameters + ---------- + nodes : list or cudf.Series + The nodes of the graph to be stored. 
+ """ + self._Impl._nodes["all_nodes"] = cudf.Series(nodes) + + # TODO: Add function + # def properties(): + + +class DiGraph(Graph): + def __init__(self, m_graph=None): + warnings.warn( + "DiGraph is deprecated, use Graph(directed=True) instead", + DeprecationWarning + ) + super(DiGraph, self).__init__(m_graph, directed=True) + + +class MultiGraph(Graph): + def __init__(self, directed=False): + super(MultiGraph, self).__init__(directed=directed) + self.graph_properties.multi_edge = True + + def is_multigraph(self): + """ + Returns True if the graph is a multigraph. Else returns False. + """ + # TO DO: Call coloring algorithm + return True + + +class MultiDiGraph(MultiGraph): + def __init__(self): + warnings.warn( + "MultiDiGraph is deprecated,\ + use MultiGraph(directed=True) instead", + DeprecationWarning + ) + super(MultiDiGraph, self).__init__(directed=True) + + +class Tree(Graph): + def __init__(self, directed=False): + super(Tree, self).__init__(directed=directed) + self.graph_properties.tree = True + + +class NPartiteGraph(Graph): + def __init__(self, bipartite=False, directed=False): + super(NPartiteGraph, self).__init__(directed=directed) + self.graph_properties.bipartite = bipartite + self.graph_properties.multipartite = True + + def from_cudf_edgelist( + self, + input_df, + source="source", + destination="destination", + edge_attr=None, + renumber=True + ): + """ + Initialize a graph from the edge list. It is an error to call this + method on an initialized Graph object. The passed input_df argument + wraps gdf_column objects that represent a graph using the edge list + format. source argument is source column name and destination argument + is destination column name. + By default, renumbering is enabled to map the source and destination + vertices into an index in the range [0, V) where V is the number + of vertices. If the input vertices are a single column of integers + in the range [0, V), renumbering can be disabled and the original + external vertex ids will be used. + If weights are present, edge_attr argument is the weights column name. + + Parameters + ---------- + input_df : cudf.DataFrame or dask_cudf.DataFrame + A DataFrame that contains edge information. If a dask_cudf.DataFrame is + passed it will be reinterpreted as a cudf.DataFrame. For the + distributed path please use from_dask_cudf_edgelist. + + source : str or array-like + source column name or array of column names + + destination : str or array-like + destination column name or array of column names + + edge_attr : str or None + the weights column name. Default is None + + renumber : bool + Indicate whether or not to renumber the source and destination vertex + IDs. Default is True. + + Examples + -------- + >>> df = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.BiPartiteGraph() + >>> G.from_cudf_edgelist(df, source='0', destination='1', + edge_attr='2', renumber=False) + """ + if self._Impl is None: + self._Impl = npartiteGraphImpl(self.graph_properties) + # API may change in future + self._Impl._npartiteGraphImpl__from_edgelist(input_df, + source=source, + destination=destination, + edge_attr=edge_attr, + renumber=renumber) + + def from_dask_cudf_edgelist( + self, + input_ddf, + source="source", + destination="destination", + edge_attr=None, + renumber=True, + ): + """ + Initializes the distributed graph from the dask_cudf.DataFrame + edgelist. Undirected Graphs are not currently supported. 
+        By default, renumbering is enabled to map the source and destination
+        vertices into an index in the range [0, V) where V is the number
+        of vertices. If the input vertices are a single column of integers
+        in the range [0, V), renumbering can be disabled and the original
+        external vertex ids will be used.
+        Note that the graph object will store a reference to the
+        dask_cudf.DataFrame provided.
+
+        Parameters
+        ----------
+        input_ddf : dask_cudf.DataFrame
+            The edgelist as a dask_cudf.DataFrame
+
+        source : str or array-like
+            source column name or array of column names
+
+        destination : str or array-like
+            destination column name or array of column names
+
+        edge_attr : str
+            weights column name.
+
+        renumber : bool
+            If the source and destination indices are not in the range 0 to
+            V, where V is the number of vertices, the renumber argument
+            should be True.
+        """
+        raise Exception("Distributed N-partite graph not supported")
+
+    def add_nodes_from(self, nodes, bipartite=None, multipartite=None):
+        """
+        Add nodes information to the Graph.
+
+        Parameters
+        ----------
+        nodes : list or cudf.Series
+            The nodes of the graph to be stored. If the bipartite and
+            multipartite arguments are not passed, the nodes are considered
+            to be a list of all the nodes present in the Graph.
+        bipartite : str
+            Sets the Graph as bipartite. The nodes are stored as a set of
+            nodes of the partition named by the bipartite argument.
+        multipartite : str
+            Sets the Graph as multipartite. The nodes are stored as a set of
+            nodes of the partition named by the multipartite argument.
+        """
+        if self._Impl is None:
+            self._Impl = npartiteGraphImpl(self.graph_properties)
+        if bipartite is None and multipartite is None:
+            self._Impl._nodes["all_nodes"] = cudf.Series(nodes)
+        else:
+            self._Impl.add_nodes_from(nodes, bipartite=bipartite,
+                                      multipartite=multipartite)
+
+    def is_multipartite(self):
+        """
+        Checks if Graph is multipartite. This solely relies on the user call
+        of add_nodes_from with the partition parameter and the Graph created.
+        This does not parse the graph to check if it is multipartite.
+        """
+        return True
+
+
+class BiPartiteGraph(NPartiteGraph):
+    def __init__(self, directed=False):
+        super(BiPartiteGraph, self).__init__(directed=directed,
+                                             bipartite=True)
+
+    def is_bipartite(self):
+        """
+        Checks if Graph is bipartite. This solely relies on the user call of
+        add_nodes_from with the bipartite parameter and the Graph created.
+        This does not parse the graph to check if it is bipartite.
+        """
+        return True
+
+
+class BiPartiteDiGraph(BiPartiteGraph):
+    def __init__(self):
+        warnings.warn(
+            "BiPartiteDiGraph is deprecated, "
+            "use BiPartiteGraph(directed=True) instead",
+            DeprecationWarning
+        )
+        super(BiPartiteDiGraph, self).__init__(directed=True)
+
+
+class NPartiteDiGraph(NPartiteGraph):
+    def __init__(self):
+        warnings.warn(
+            "NPartiteDiGraph is deprecated, "
+            "use NPartiteGraph(directed=True) instead",
+            DeprecationWarning
+        )
+        super(NPartiteDiGraph, self).__init__(directed=True)
+
+
+def is_directed(G):
+    """
+    Returns True if the graph is a directed graph.
+    Returns False if the graph is an undirected graph.
+    """
+    return G.is_directed()
+
+
+def is_multigraph(G):
+    """
+    Returns True if the graph is a multigraph. Else returns False.
+    """
+    return G.is_multigraph()
+
+
+def is_multipartite(G):
+    """
+    Checks if Graph is multipartite. This solely relies on the Graph
+    type. This does not parse the graph to check if it is multipartite.
+    """
+    return G.is_multipartite()
+
+
+def is_bipartite(G):
+    """
+    Checks if Graph is bipartite.
+    This solely relies on the Graph type. This does not parse the graph
+    to check if it is bipartite.
+    """
+    return G.is_bipartite()
+
+
+def is_weighted(G):
+    """
+    Returns True if the graph has edge weights.
+    """
+    return G.is_weighted()
diff --git a/python/cugraph/structure/graph_implementation/__init__.py b/python/cugraph/structure/graph_implementation/__init__.py
new file mode 100644
index 00000000000..eeef73c0f64
--- /dev/null
+++ b/python/cugraph/structure/graph_implementation/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .simpleGraph import simpleGraphImpl
+from .simpleDistributedGraph import simpleDistributedGraphImpl
+from .npartiteGraph import npartiteGraphImpl
+
diff --git a/python/cugraph/structure/graph_implementation/npartiteGraph.py b/python/cugraph/structure/graph_implementation/npartiteGraph.py
new file mode 100644
index 00000000000..111d9f792fa
--- /dev/null
+++ b/python/cugraph/structure/graph_implementation/npartiteGraph.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .simpleGraph import simpleGraphImpl
+import cudf
+
+
+class npartiteGraphImpl(simpleGraphImpl):
+    def __init__(self, properties):
+        super(npartiteGraphImpl, self).__init__(properties)
+        self.properties.bipartite = properties.bipartite
+
+    # API may change in future
+    def __from_edgelist(
+        self,
+        input_df,
+        source="source",
+        destination="destination",
+        edge_attr=None,
+        renumber=True,
+    ):
+        self._simpleGraphImpl__from_edgelist(
+            input_df,
+            source=source,
+            destination=destination,
+            edge_attr=edge_attr,
+            renumber=renumber,
+        )
+
+    def sets(self):
+        """
+        Returns the bipartite set of nodes. This solely relies on the user's
+        call of add_nodes_from with the bipartite parameter. This does not
+        parse the graph to compute bipartite sets. If the bipartite argument
+        was not provided during add_nodes_from(), this raises an exception
+        indicating that the graph is not bipartite.
+ """ + # TO DO: Call coloring algorithm + set_names = [i for i in self._nodes.keys() if i != "all_nodes"] + if self.properties.bipartite: + top = self._nodes[set_names[0]] + if len(set_names) == 2: + bottom = self._nodes[set_names[1]] + else: + bottom = cudf.Series( + set(self.nodes().values_host) - set(top.values_host) + ) + return top, bottom + else: + return {k: self._nodes[k] for k in set_names} + + # API may change in future + def add_nodes_from(self, nodes, bipartite=None, multipartite=None): + """ + Add nodes information to the Graph. + Parameters + ---------- + nodes : list or cudf.Series + The nodes of the graph to be stored. If bipartite and multipartite + arguments are not passed, the nodes are considered to be a list of + all the nodes present in the Graph. + bipartite : str + Sets the Graph as bipartite. The nodes are stored as a set of nodes + of the partition named as bipartite argument. + multipartite : str + Sets the Graph as multipartite. The nodes are stored as a set of + nodes of the partition named as multipartite argument. + """ + if bipartite is None and multipartite is None: + raise Exception("Partition not provided") + else: + set_names = [i for i in self._nodes.keys() if i != "all_nodes"] + if multipartite is not None: + if self.properties.bipartite: + raise Exception( + "The Graph is bipartite. " + "Use bipartite option instead." + ) + elif bipartite is not None: + if not self.properties.bipartite: + raise Exception( + "The Graph is set as npartite. " + "Use multipartite option instead.") + multipartite = bipartite + if multipartite not in set_names and len(set_names) == 2: + raise Exception( + "The Graph is set as bipartite and " + "already has two partitions initialized." + ) + self._nodes[multipartite] = cudf.Series(nodes) diff --git a/python/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/structure/graph_implementation/simpleDistributedGraph.py new file mode 100644 index 00000000000..10d46cc3fed --- /dev/null +++ b/python/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -0,0 +1,481 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from cugraph.structure import graph_primtypes_wrapper
+from cugraph.structure.graph_primtypes_wrapper import Direction
+from cugraph.structure.number_map import NumberMap
+import cudf
+import dask_cudf
+
+
+class simpleDistributedGraphImpl:
+    class EdgeList:
+        def __init__(self, ddf):
+            self.edgelist_df = ddf
+            self.weights = False
+            # FIXME: Edge Attribute not handled
+
+    # class AdjList:
+    # Not Supported
+
+    # class transposedAdjList:
+    # Not Supported
+
+    class Properties:
+        def __init__(self, properties):
+            self.multi_edge = getattr(properties, 'multi_edge', False)
+            self.directed = properties.directed
+            self.renumbered = False
+            self.store_transposed = False
+            self.self_loop = None
+            self.isolated_vertices = None
+            self.node_count = None
+            self.edge_count = None
+            self.weighted = False
+
+    def __init__(self, properties):
+        # Structure
+        self.edgelist = None
+        self.renumber_map = None
+        self.aggregate_segment_offsets = None
+        self.properties = simpleDistributedGraphImpl.Properties(properties)
+        self.source_columns = None
+        self.destination_columns = None
+
+    # Functions
+    def __from_edgelist(
+        self,
+        input_ddf,
+        source="source",
+        destination="destination",
+        edge_attr=None,
+        renumber=True,
+        store_transposed=False,
+    ):
+        if not isinstance(input_ddf, dask_cudf.DataFrame):
+            raise Exception("input should be a dask_cudf.DataFrame")
+        if self.properties.directed is False:
+            raise Exception("Undirected distributed graph not supported")
+
+        s_col = source
+        d_col = destination
+        if not isinstance(s_col, list):
+            s_col = [s_col]
+        if not isinstance(d_col, list):
+            d_col = [d_col]
+        if not (
+            set(s_col).issubset(set(input_ddf.columns))
+            and set(d_col).issubset(set(input_ddf.columns))
+        ):
+            raise Exception(
+                "source column names and/or destination column "
+                "names not found in input. Recheck the source "
+                "and destination parameters"
+            )
+        ddf_columns = s_col + d_col
+        if edge_attr is not None:
+            if not (set([edge_attr]).issubset(set(input_ddf.columns))):
+                raise Exception(
+                    "edge_attr column name not found in input. "
+                    "Recheck the edge_attr parameter")
+            self.properties.weighted = True
+            ddf_columns = ddf_columns + [edge_attr]
+        input_ddf = input_ddf[ddf_columns]
+
+        if edge_attr is not None:
+            input_ddf = input_ddf.rename(columns={edge_attr: 'value'})
+
+        #
+        # Keep all of the original parameters so we can lazily
+        # evaluate this function
+        #
+
+        # FIXME: Edge Attribute not handled
+        self.properties.renumbered = renumber
+        self.input_df = input_ddf
+        self.source_columns = source
+        self.destination_columns = destination
+
+    def view_edge_list(self):
+        """
+        Display the edge list. Compute it if needed.
+        NOTE: If the graph is of type Graph() then the displayed undirected
+        edges are the same as displayed by networkx Graph(), but the
+        direction could be different, i.e. an edge displayed by cugraph as
+        (src, dst) could be displayed as (dst, src) by networkx.
+        cugraph.Graph stores the symmetrized edgelist internally. When
+        displaying the undirected edgelist for a Graph, the upper triangular
+        matrix of the symmetrized edgelist is returned.
+        networkx.Graph renumbers the input and stores the upper triangle of
+        this renumbered input. Since the internal renumbering of networkx
+        and cugraph is different, the upper triangular matrix of the
+        networkx renumbered input may not be the same as cugraph's upper
+        triangular matrix of the symmetrized edgelist. Hence the displayed
+        source and destination pairs in both will represent the same edge
+        but node values could be swapped.
+
+        Returns
+        -------
+        df : cudf.DataFrame
+            This cudf.DataFrame wraps source, destination and weight
+            df[src] : cudf.Series
+                contains the source index for each edge
+            df[dst] : cudf.Series
+                contains the destination index for each edge
+            df[weight] : cudf.Series
+                This column is only present for a weighted Graph and
+                contains the weight value for each edge
+        """
+        if self.edgelist is None:
+            raise Exception("Graph has no Edgelist.")
+        return self.edgelist.edgelist_df
+
+    def delete_edge_list(self):
+        """
+        Delete the edge list.
+        """
+        # decrease reference count to free memory if the referenced objects
+        # are no longer used.
+        self.edgelist = None
+
+    def clear(self):
+        """
+        Empty this graph. This function is added for NetworkX compatibility.
+        """
+        self.edgelist = None
+
+    def number_of_vertices(self):
+        """
+        Get the number of nodes in the graph.
+        """
+        if self.properties.node_count is None:
+            if self.edgelist is not None:
+                ddf = self.edgelist.edgelist_df[["src", "dst"]]
+                self.properties.node_count = ddf.max().max().compute() + 1
+            else:
+                raise Exception("Graph is Empty")
+        return self.properties.node_count
+
+    def number_of_nodes(self):
+        """
+        An alias of number_of_vertices(). This function is added for NetworkX
+        compatibility.
+        """
+        return self.number_of_vertices()
+
+    def number_of_edges(self, directed_edges=False):
+        """
+        Get the number of edges in the graph.
+        """
+        if self.edgelist is not None:
+            return len(self.edgelist.edgelist_df)
+        else:
+            raise Exception("Graph is Empty")
+
+    def in_degree(self, vertex_subset=None):
+        """
+        Compute vertex in-degree. Vertex in-degree is the number of edges
+        pointing into the vertex. By default, this method computes vertex
+        degrees for the entire set of vertices. If vertex_subset is provided,
+        this method optionally filters out all but those listed in
+        vertex_subset.
+
+        Parameters
+        ----------
+        vertex_subset : cudf.Series or iterable container, optional
+            A container of vertices for displaying corresponding in-degree.
+            If not set, degrees are computed for the entire set of vertices.
+
+        Returns
+        -------
+        df : cudf.DataFrame
+            GPU DataFrame of size N (the default) or the size of the given
+            vertices (vertex_subset) containing the in_degree. The ordering
+            is relative to the adjacency list, or that given by the
+            specified vertex_subset.
+            df[vertex] : cudf.Series
+                The vertex IDs (will be identical to vertex_subset if
+                specified).
+            df[degree] : cudf.Series
+                The computed in-degree of the corresponding vertex.
+
+        Examples
+        --------
+        >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
+        >>>                   dtype=['int32', 'int32', 'float32'],
+        >>>                   header=None)
+        >>> G = cugraph.Graph()
+        >>> G.from_cudf_edgelist(M, '0', '1')
+        >>> df = G.in_degree([0,9,12])
+        """
+        return self._degree(vertex_subset, direction=Direction.IN)
+
+    def out_degree(self, vertex_subset=None):
+        """
+        Compute vertex out-degree. Vertex out-degree is the number of edges
+        pointing out from the vertex. By default, this method computes vertex
+        degrees for the entire set of vertices. If vertex_subset is provided,
+        this method optionally filters out all but those listed in
+        vertex_subset.
+
+        Parameters
+        ----------
+        vertex_subset : cudf.Series or iterable container, optional
+            A container of vertices for displaying corresponding out-degree.
+            If not set, degrees are computed for the entire set of vertices.
+
+        Returns
+        -------
+        df : cudf.DataFrame
+            GPU DataFrame of size N (the default) or the size of the given
+            vertices (vertex_subset) containing the out_degree.
The ordering is + relative to the adjacency list, or that given by the specified + vertex_subset. + df[vertex] : cudf.Series + The vertex IDs (will be identical to vertex_subset if + specified). + df[degree] : cudf.Series + The computed out-degree of the corresponding vertex. + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> df = G.out_degree([0,9,12]) + """ + return self._degree(vertex_subset, direction=Direction.OUT) + + def degree(self, vertex_subset=None): + """ + Compute vertex degree, which is the total number of edges incident + to a vertex (both in and out edges). By default, this method computes + degrees for the entire set of vertices. If vertex_subset is provided, + then this method optionally filters out all but those listed in + vertex_subset. + Parameters + ---------- + vertex_subset : cudf.Series or iterable container, optional + a container of vertices for displaying corresponding degree. If not + set, degrees are computed for the entire set of vertices. + Returns + ------- + df : cudf.DataFrame + GPU DataFrame of size N (the default) or the size of the given + vertices (vertex_subset) containing the degree. The ordering is + relative to the adjacency list, or that given by the specified + vertex_subset. + df['vertex'] : cudf.Series + The vertex IDs (will be identical to vertex_subset if + specified). + df['degree'] : cudf.Series + The computed degree of the corresponding vertex. + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> all_df = G.degree() + >>> subset_df = G.degree([0,9,12]) + """ + raise Exception("Not supported for distributed graph") + + # FIXME: vertex_subset could be a DataFrame for multi-column vertices + def degrees(self, vertex_subset=None): + """ + Compute vertex in-degree and out-degree. By default, this method + computes vertex degrees for the entire set of vertices. If + vertex_subset is provided, this method optionally filters out all but + those listed in vertex_subset. + Parameters + ---------- + vertex_subset : cudf.Series or iterable container, optional + A container of vertices for displaying corresponding degree. If not + set, degrees are computed for the entire set of vertices. + Returns + ------- + df : cudf.DataFrame + GPU DataFrame of size N (the default) or the size of the given + vertices (vertex_subset) containing the degrees. The ordering is + relative to the adjacency list, or that given by the specified + vertex_subset. + df['vertex'] : cudf.Series + The vertex IDs (will be identical to vertex_subset if + specified). + df['in_degree'] : cudf.Series + The in-degree of the vertex. + df['out_degree'] : cudf.Series + The out-degree of the vertex. 
+ Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> df = G.degrees([0,9,12]) + """ + raise Exception("Not supported for distributed graph") + + def _degree(self, vertex_subset, direction=Direction.ALL): + vertex_col, degree_col = graph_primtypes_wrapper._mg_degree(self, + direction) + df = cudf.DataFrame() + df["vertex"] = vertex_col + df["degree"] = degree_col + + if self.properties.renumbered is True: + df = self.renumber_map.unrenumber(df, "vertex") + + if vertex_subset is not None: + df = df[df['vertex'].isin(vertex_subset)] + + return df + + def to_directed(self, DiG): + """ + Return a directed representation of the graph. + This function sets the type of graph as DiGraph() and returns the + directed view. + Returns + ------- + G : DiGraph + A directed graph with the same nodes, and each edge (u,v,weights) + replaced by two directed edges (u,v,weights) and (v,u,weights). + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> DiG = G.to_directed() + """ + # TODO: Add support + raise Exception("Not supported for distributed graph") + + def to_undirected(self, G): + """ + Return an undirected copy of the graph. + Returns + ------- + G : Graph + A undirected graph with the same nodes, and each directed edge + (u,v,weights) replaced by an undirected edge (u,v,weights). + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> DiG = cugraph.DiGraph() + >>> DiG.from_cudf_edgelist(M, '0', '1') + >>> G = DiG.to_undirected() + """ + + # TODO: Add support + raise Exception("Not supported for distributed graph") + + def has_node(self, n): + """ + Returns True if the graph contains the node n. + """ + if self.edgelist is None: + raise Exception("Graph has no Edgelist.") + # FIXME: Check renumber map + ddf = self.edgelist.edgelist_df[["src", "dst"]] + return (ddf == n).any().any().compute() + + def has_edge(self, u, v): + """ + Returns True if the graph contains the edge (u,v). + """ + # TODO: Verify Correctness + if self.properties.renumbered: + tmp = cudf.DataFrame({"src": [u, v]}) + tmp = tmp.astype({"src": "int"}) + tmp = self.add_internal_vertex_id( + tmp, "id", "src", preserve_order=True + ) + + u = tmp["id"][0] + v = tmp["id"][1] + + df = self.edgelist.edgelist_df + return ((df["src"] == u) & (df["dst"] == v)).any().compute() + + def edges(self): + """ + Returns all the edges in the graph as a cudf.DataFrame containing + sources and destinations. It does not return the edge weights. 
+ For viewing edges with weights use view_edge_list() + """ + return self.view_edge_list()[["src", "dst"]] + + def nodes(self): + """ + Returns all the nodes in the graph as a cudf.Series + """ + # FIXME: Return renumber map nodes + raise Exception("Not supported for distributed graph") + + def neighbors(self, n): + if self.edgelist is None: + raise Exception("Graph has no Edgelist.") + # FIXME: Add renumbering of node n + ddf = self.edgelist.edgelist_df + return ddf[ddf["src"] == n]["dst"].reset_index(drop=True) + + def compute_renumber_edge_list(self, transposed=False): + """ + Compute a renumbered edge list + This function works in the MNMG pipeline and will transform + the input dask_cudf.DataFrame into a renumbered edge list + in the prescribed direction. + This function will be called by the algorithms to ensure + that the graph is renumbered properly. The graph object will + cache the most recent renumbering attempt. For benchmarking + purposes, this function can be called prior to calling a + graph algorithm so we can measure the cost of computing + the renumbering separately from the cost of executing the + algorithm. + When creating a CSR-like structure, set transposed to False. + When creating a CSC-like structure, set transposed to True. + Parameters + ---------- + transposed : (optional) bool + If True, renumber with the intent to make a CSC-like + structure. If False, renumber with the intent to make + a CSR-like structure. Defaults to False. + """ + # FIXME: What to do about edge_attr??? + # currently ignored for MNMG + + if not self.properties.renumbered: + self.edgelist = self.EdgeList(self.input_df) + self.renumber_map = None + else: + if self.edgelist is not None: + if self.properties.directed is False: + return + + if self.properties.store_transposed == transposed: + return + + del self.edgelist + + renumbered_ddf, number_map, aggregate_segment_offsets = \ + NumberMap.renumber_and_segment(self.input_df, + self.source_columns, + self.destination_columns, + store_transposed=transposed) + self.edgelist = self.EdgeList(renumbered_ddf) + self.renumber_map = number_map + self.aggregate_segment_offsets = aggregate_segment_offsets + self.properties.store_transposed = transposed + + def vertex_column_size(self): + if self.properties.renumbered: + return self.renumber_map.vertex_column_size() + else: + return 1 diff --git a/python/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/structure/graph_implementation/simpleGraph.py new file mode 100644 index 00000000000..e74b04c00b5 --- /dev/null +++ b/python/cugraph/structure/graph_implementation/simpleGraph.py @@ -0,0 +1,831 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
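+
+# Note: users do not instantiate simpleGraphImpl directly. cugraph.Graph
+# creates it lazily on the first from_cudf_edgelist()/from_cudf_adjlist()
+# call and forwards attribute lookups to it through Graph.__getattr__,
+# so, for example (illustrative sketch, given an existing cudf edgelist
+# gdf with columns '0' and '1'):
+#
+#   G = cugraph.Graph()
+#   G.from_cudf_edgelist(gdf, '0', '1')   # instantiates simpleGraphImpl
+#   G.view_edge_list()                    # dispatched via __getattr__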
+
+from cugraph.structure import graph_primtypes_wrapper
+from cugraph.structure.graph_primtypes_wrapper import Direction
+from cugraph.structure.symmetrize import symmetrize
+from cugraph.structure.number_map import NumberMap
+import cugraph.dask.common.mg_utils as mg_utils
+import cudf
+import dask_cudf
+import cugraph.comms.comms as Comms
+import pandas as pd
+import numpy as np
+from cugraph.dask.structure import replication
+
+
+# FIXME: Change to consistent camel case naming
+class simpleGraphImpl:
+
+    class EdgeList:
+        def __init__(self, source, destination, edge_attr=None):
+            self.edgelist_df = cudf.DataFrame()
+            self.edgelist_df["src"] = source
+            self.edgelist_df["dst"] = destination
+            self.weights = False
+            if edge_attr is not None:
+                self.weights = True
+                if type(edge_attr) is dict:
+                    for k in edge_attr.keys():
+                        self.edgelist_df[k] = edge_attr[k]
+                else:
+                    self.edgelist_df["weights"] = edge_attr
+
+    class AdjList:
+        def __init__(self, offsets, indices, value=None):
+            self.offsets = offsets
+            self.indices = indices
+            self.weights = value  # Should be a dataframe for multiple weights
+
+    class transposedAdjList:
+        def __init__(self, offsets, indices, value=None):
+            simpleGraphImpl.AdjList.__init__(self, offsets, indices, value)
+
+    class Properties:
+        def __init__(self, properties):
+            self.multi_edge = getattr(properties, 'multi_edge', False)
+            self.directed = properties.directed
+            self.renumbered = False
+            self.self_loop = None
+            self.isolated_vertices = None
+            self.node_count = None
+            self.edge_count = None
+            self.weighted = False
+
+    def __init__(self, properties):
+        # Structure
+        self.edgelist = None
+        self.adjlist = None
+        self.transposedadjlist = None
+        self.renumber_map = None
+        self.properties = simpleGraphImpl.Properties(properties)
+        self._nodes = {}
+
+        # TODO: Move to new batch class
+        # MG - Batch
+        self.batch_enabled = False
+        self.batch_edgelists = None
+        self.batch_adjlists = None
+        self.batch_transposed_adjlists = None
+
+    # Functions
+    # FIXME: Change to public function
+    # FIXME: Make function more modular
+    def __from_edgelist(
+        self,
+        input_df,
+        source="source",
+        destination="destination",
+        edge_attr=None,
+        renumber=True,
+    ):
+
+        # Verify column names present in input DataFrame
+        s_col = source
+        d_col = destination
+        if not isinstance(s_col, list):
+            s_col = [s_col]
+        if not isinstance(d_col, list):
+            d_col = [d_col]
+        if not (
+            set(s_col).issubset(set(input_df.columns))
+            and set(d_col).issubset(set(input_df.columns))
+        ):
+            # FIXME: Raise concrete Exceptions
+            raise Exception(
+                "source column names and/or destination column "
+                "names not found in input. Recheck the source and "
+                "destination parameters"
+            )
+
+        # FIXME: check if the consolidated graph fits on the
+        # device before gathering all the edge lists
+
+        # Consolidation
+        # (the 2147483100 limit leaves headroom just below INT32_MAX)
+        if isinstance(input_df, cudf.DataFrame):
+            if len(input_df[source]) > 2147483100:
+                raise Exception(
+                    "cudf.DataFrame edge list is too big "
+                    "to fit in a single GPU"
+                )
+            elist = input_df
+        elif isinstance(input_df, dask_cudf.DataFrame):
+            if len(input_df[source]) > 2147483100:
+                raise Exception(
+                    "dask_cudf.DataFrame edge list is too big "
+                    "to fit in a single GPU"
+                )
+            elist = input_df.compute().reset_index(drop=True)
+        else:
+            raise Exception(
+                "input should be a cudf.DataFrame or "
+                "a dask_cudf.DataFrame"
+            )
+
+        # Renumbering
+        self.renumber_map = None
+        if renumber:
+            # FIXME: Should SG do lazy evaluation like MG?
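+            # NumberMap.renumber maps the (possibly multi-column) external
+            # vertex ids to a dense [0, V) range and returns the map used
+            # for the translation; results can be translated back later
+            # with renumber_map.unrenumber(). The renumbered edge list
+            # always uses the column names "src" and "dst".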
+            elist, renumber_map = NumberMap.renumber(
+                elist, source, destination, store_transposed=False
+            )
+            source = "src"
+            destination = "dst"
+            self.properties.renumbered = True
+            self.renumber_map = renumber_map
+        else:
+            if type(source) is list and type(destination) is list:
+                raise Exception("set renumber to True for multi column ids")
+
+        # Populate graph edgelist
+        source_col = elist[source]
+        dest_col = elist[destination]
+
+        if edge_attr is not None:
+            self.properties.weighted = True
+            value_col = elist[edge_attr]
+        else:
+            value_col = None
+
+        # TODO: Update Symmetrize to work on Graph and/or DataFrame
+        if value_col is not None:
+            source_col, dest_col, value_col = symmetrize(
+                source_col, dest_col, value_col,
+                multi=self.properties.multi_edge,
+                symmetrize=not self.properties.directed)
+            if isinstance(value_col, cudf.DataFrame):
+                value_dict = {}
+                for i in value_col.columns:
+                    value_dict[i] = value_col[i]
+                value_col = value_dict
+        else:
+            source_col, dest_col = symmetrize(
+                source_col, dest_col, multi=self.properties.multi_edge,
+                symmetrize=not self.properties.directed)
+
+        self.edgelist = simpleGraphImpl.EdgeList(source_col, dest_col,
+                                                 value_col)
+
+        if self.batch_enabled:
+            self._replicate_edgelist()
+
+    def to_pandas_edgelist(self, source='source', destination='destination'):
+        """
+        Returns the graph edge list as a Pandas DataFrame.
+        Parameters
+        ----------
+        source : str or array-like
+            source column name or array of column names
+        destination : str or array-like
+            destination column name or array of column names
+        Returns
+        -------
+        df : pandas.DataFrame
+        """
+
+        # FIXME: the source/destination parameters are currently unused
+        gdf = self.view_edge_list()
+        return gdf.to_pandas()
+
+    def to_pandas_adjacency(self):
+        """
+        Returns the graph adjacency matrix as a Pandas DataFrame.
+        """
+
+        np_array_data = self.to_numpy_array()
+        pdf = pd.DataFrame(np_array_data)
+        if self.properties.renumbered:
+            nodes = self.renumber_map.implementation.df['0'].\
+                values_host.tolist()
+            pdf.columns = nodes
+            pdf.index = nodes
+        return pdf
+
+    def to_numpy_array(self):
+        """
+        Returns the graph adjacency matrix as a NumPy array.
+        """
+
+        nlen = self.number_of_nodes()
+        elen = self.number_of_edges()
+        df = self.edgelist.edgelist_df
+        # Builds a dense V x V host array one edge at a time; assumes a
+        # weighted edge list (a "weights" column must be present)
+        np_array = np.full((nlen, nlen), 0.0)
+        for i in range(0, elen):
+            np_array[df['src'].iloc[i], df['dst'].iloc[i]] = df['weights'].\
+                iloc[i]
+        return np_array
+
+    def to_numpy_matrix(self):
+        """
+        Returns the graph adjacency matrix as a NumPy matrix.
+        """
+        np_array = self.to_numpy_array()
+        return np.asmatrix(np_array)
+
+    def view_edge_list(self):
+        """
+        Display the edge list. Compute it if needed.
+        NOTE: If the graph is of type Graph() then the displayed undirected
+        edges are the same as displayed by networkx Graph(), but the direction
+        could be different i.e. an edge displayed by cugraph as (src, dst)
+        could be displayed as (dst, src) by networkx.
+        cugraph.Graph stores the symmetrized edgelist internally. For
+        displaying the undirected edgelist for a Graph, the upper triangular
+        matrix of the symmetrized edgelist is returned.
+        networkx.Graph renumbers the input and stores the upper triangle of
+        this renumbered input. Since the internal renumbering of networkx and
+        cugraph is different, the upper triangular matrix of the networkx
+        renumbered input may not be the same as cugraph's upper triangular
+        matrix of the symmetrized edgelist. Hence the displayed source and
+        destination pairs in both will represent the same edge but node values
+        could be swapped.
+        Returns
+        -------
+        df : cudf.DataFrame
+            This cudf.DataFrame wraps source, destination and weight
+            df[src] : cudf.Series
+                contains the source index for each edge
+            df[dst] : cudf.Series
+                contains the destination index for each edge
+            df[weight] : cudf.Series
+                Column is only present for a weighted Graph and contains
+                the weight value for each edge
+        """
+        if self.edgelist is None:
+            src, dst, weights = graph_primtypes_wrapper.view_edge_list(self)
+            self.edgelist = self.EdgeList(src, dst, weights)
+
+        edgelist_df = self.edgelist.edgelist_df
+
+        if self.properties.renumbered:
+            edgelist_df = self.renumber_map.unrenumber(edgelist_df, "src")
+            edgelist_df = self.renumber_map.unrenumber(edgelist_df, "dst")
+
+        if not self.properties.directed:
+            edgelist_df = edgelist_df[edgelist_df["src"] <= edgelist_df["dst"]]
+            edgelist_df = edgelist_df.reset_index(drop=True)
+            self.properties.edge_count = len(edgelist_df)
+
+        return edgelist_df
+
+    def delete_edge_list(self):
+        """
+        Delete the edge list.
+        """
+        # decrease reference count to free memory if the referenced objects
+        # are no longer used.
+        self.edgelist = None
+
+    def __from_adjlist(self, offset_col, index_col, value_col=None):
+        self.adjlist = simpleGraphImpl.AdjList(offset_col, index_col,
+                                               value_col)
+
+        if self.batch_enabled:
+            self._replicate_adjlist()
+
+    def view_adj_list(self):
+        """
+        Display the adjacency list. Compute it if needed.
+        Returns
+        -------
+        offset_col : cudf.Series
+            This cudf.Series wraps a gdf_column of size V + 1 (V: number of
+            vertices).
+            The gdf column contains the offsets for the vertices in this
+            graph. Offsets are in the range [0, E] (E: number of edges).
+        index_col : cudf.Series
+            This cudf.Series wraps a gdf_column of size E (E: number of
+            edges).
+            The gdf column contains the destination index for each edge.
+            Destination indices are in the range [0, V) (V: number of
+            vertices).
+        value_col : cudf.Series or ``None``
+            This value is ``None`` for unweighted graphs.
+            For weighted graphs, this cudf.Series wraps a gdf_column of size E
+            (E: number of edges).
+            The gdf column contains the weight value for each edge.
+            The expected type of the gdf_column element is floating point
+            number.
+        """
+
+        if self.adjlist is None:
+            if self.transposedadjlist is not None and\
+                    self.properties.directed is False:
+                off, ind, vals = (
+                    self.transposedadjlist.offsets,
+                    self.transposedadjlist.indices,
+                    self.transposedadjlist.weights,
+                )
+            else:
+                off, ind, vals = graph_primtypes_wrapper.view_adj_list(self)
+            self.adjlist = self.AdjList(off, ind, vals)
+
+            if self.batch_enabled:
+                self._replicate_adjlist()
+
+        return self.adjlist.offsets, self.adjlist.indices, self.adjlist.weights
+
+    def view_transposed_adj_list(self):
+        """
+        Display the transposed adjacency list. Compute it if needed.
+        Returns
+        -------
+        offset_col : cudf.Series
+            This cudf.Series wraps a gdf_column of size V + 1 (V: number of
+            vertices).
+            The gdf column contains the offsets for the vertices in this
+            graph. Offsets are in the range [0, E] (E: number of edges).
+        index_col : cudf.Series
+            This cudf.Series wraps a gdf_column of size E (E: number of
+            edges).
+            The gdf column contains the destination index for each edge.
+            Destination indices are in the range [0, V) (V: number of
+            vertices).
+        value_col : cudf.Series or ``None``
+            This value is ``None`` for unweighted graphs.
+            For weighted graphs, this cudf.Series wraps a gdf_column of size E
+            (E: number of edges).
+            The gdf column contains the weight value for each edge.
+ The expected type of the gdf_column element is floating point + number. + """ + + if self.transposedadjlist is None: + if self.adjlist is not None and self.properties.directed is False: + off, ind, vals = ( + self.adjlist.offsets, + self.adjlist.indices, + self.adjlist.weights, + ) + else: + ( + off, + ind, + vals, + ) = graph_primtypes_wrapper.view_transposed_adj_list(self) + self.transposedadjlist = self.transposedAdjList(off, ind, vals) + + if self.batch_enabled: + self._replicate_transposed_adjlist() + + return ( + self.transposedadjlist.offsets, + self.transposedadjlist.indices, + self.transposedadjlist.weights, + ) + + def delete_adj_list(self): + """ + Delete the adjacency list. + """ + self.adjlist = None + + # FIXME: Update batch workflow and refactor to suitable file + def enable_batch(self): + client = mg_utils.get_client() + comms = Comms.get_comms() + + if client is None or comms is None: + msg = ( + "MG Batch needs a Dask Client and the " + "Communicator needs to be initialized." + ) + raise Exception(msg) + + self.batch_enabled = True + + if self.edgelist is not None: + if self.batch_edgelists is None: + self._replicate_edgelist() + + if self.adjlist is not None: + if self.batch_adjlists is None: + self._replicate_adjlist() + + if self.transposedadjlist is not None: + if self.batch_transposed_adjlists is None: + self._replicate_transposed_adjlist() + + def _replicate_edgelist(self): + client = mg_utils.get_client() + comms = Comms.get_comms() + + # FIXME: There might be a better way to control it + if client is None: + return + work_futures = replication.replicate_cudf_dataframe( + self.edgelist.edgelist_df, client=client, comms=comms + ) + + self.batch_edgelists = work_futures + + def _replicate_adjlist(self): + client = mg_utils.get_client() + comms = Comms.get_comms() + + # FIXME: There might be a better way to control it + if client is None: + return + + weights = None + offsets_futures = replication.replicate_cudf_series( + self.adjlist.offsets, client=client, comms=comms + ) + indices_futures = replication.replicate_cudf_series( + self.adjlist.indices, client=client, comms=comms + ) + + if self.adjlist.weights is not None: + weights = replication.replicate_cudf_series(self.adjlist.weights) + else: + weights = {worker: None for worker in offsets_futures} + + merged_futures = { + worker: [ + offsets_futures[worker], + indices_futures[worker], + weights[worker], + ] + for worker in offsets_futures + } + self.batch_adjlists = merged_futures + + # FIXME: Not implemented yet + def _replicate_transposed_adjlist(self): + self.batch_transposed_adjlists = True + + def get_two_hop_neighbors(self): + """ + Compute vertex pairs that are two hops apart. The resulting pairs are + sorted before returning. + Returns + ------- + df : cudf.DataFrame + df[first] : cudf.Series + the first vertex id of a pair, if an external vertex id + is defined by only one column + df[second] : cudf.Series + the second vertex id of a pair, if an external vertex id + is defined by only one column + """ + + df = graph_primtypes_wrapper.get_two_hop_neighbors(self) + + if self.properties.renumbered is True: + df = self.renumber_map.unrenumber(df, "first") + df = self.renumber_map.unrenumber(df, "second") + + return df + + def number_of_vertices(self): + """ + Get the number of nodes in the graph. 
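+        Returns
+        -------
+        count : int
+            The number of vertices in the graph, derived from the adjacency
+            list, the transposed adjacency list, or the edge list, whichever
+            has already been computed.
+        Examples
+        --------
+        >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
+        >>>                   dtype=['int32', 'int32', 'float32'], header=None)
+        >>> G = cugraph.Graph()
+        >>> G.from_cudf_edgelist(M, '0', '1')
+        >>> G.number_of_vertices()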
+ """ + if self.properties.node_count is None: + if self.adjlist is not None: + self.properties.node_count = len(self.adjlist.offsets) - 1 + elif self.transposedadjlist is not None: + self.properties.node_count = len( + self.transposedadjlist.offsets) - 1 + elif self.edgelist is not None: + df = self.edgelist.edgelist_df[["src", "dst"]] + self.properties.node_count = df.max().max() + 1 + else: + raise Exception("Graph is Empty") + return self.properties.node_count + + def number_of_nodes(self): + """ + An alias of number_of_vertices(). This function is added for NetworkX + compatibility. + """ + return self.number_of_vertices() + + def number_of_edges(self, directed_edges=False): + """ + Get the number of edges in the graph. + """ + # TODO: Move to Outer graphs? + if directed_edges and self.edgelist is not None: + return len(self.edgelist.edgelist_df) + if self.properties.edge_count is None: + if self.edgelist is not None: + if self.properties.directed is False: + self.properties.edge_count = len( + self.edgelist.edgelist_df[ + self.edgelist.edgelist_df["src"] + >= self.edgelist.edgelist_df["dst"] + ] + ) + else: + self.properties.edge_count = len(self.edgelist.edgelist_df) + elif self.adjlist is not None: + self.properties.edge_count = len(self.adjlist.indices) + elif self.transposedadjlist is not None: + self.properties.edge_count = len( + self.transposedadjlist.indices) + else: + raise ValueError("Graph is Empty") + return self.properties.edge_count + + def in_degree(self, vertex_subset=None): + """ + Compute vertex in-degree. Vertex in-degree is the number of edges + pointing into the vertex. By default, this method computes vertex + degrees for the entire set of vertices. If vertex_subset is provided, + this method optionally filters out all but those listed in + vertex_subset. + Parameters + ---------- + vertex_subset : cudf.Series or iterable container, optional + A container of vertices for displaying corresponding in-degree. + If not set, degrees are computed for the entire set of vertices. + Returns + ------- + df : cudf.DataFrame + GPU DataFrame of size N (the default) or the size of the given + vertices (vertex_subset) containing the in_degree. The ordering is + relative to the adjacency list, or that given by the specified + vertex_subset. + df[vertex] : cudf.Series + The vertex IDs (will be identical to vertex_subset if + specified). + df[degree] : cudf.Series + The computed in-degree of the corresponding vertex. + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> df = G.in_degree([0,9,12]) + """ + return self._degree(vertex_subset, direction=Direction.IN) + + def out_degree(self, vertex_subset=None): + """ + Compute vertex out-degree. Vertex out-degree is the number of edges + pointing out from the vertex. By default, this method computes vertex + degrees for the entire set of vertices. If vertex_subset is provided, + this method optionally filters out all but those listed in + vertex_subset. + Parameters + ---------- + vertex_subset : cudf.Series or iterable container, optional + A container of vertices for displaying corresponding out-degree. + If not set, degrees are computed for the entire set of vertices. + Returns + ------- + df : cudf.DataFrame + GPU DataFrame of size N (the default) or the size of the given + vertices (vertex_subset) containing the out_degree. 
The ordering is + relative to the adjacency list, or that given by the specified + vertex_subset. + df[vertex] : cudf.Series + The vertex IDs (will be identical to vertex_subset if + specified). + df[degree] : cudf.Series + The computed out-degree of the corresponding vertex. + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> df = G.out_degree([0,9,12]) + """ + return self._degree(vertex_subset, direction=Direction.OUT) + + def degree(self, vertex_subset=None): + """ + Compute vertex degree, which is the total number of edges incident + to a vertex (both in and out edges). By default, this method computes + degrees for the entire set of vertices. If vertex_subset is provided, + then this method optionally filters out all but those listed in + vertex_subset. + Parameters + ---------- + vertex_subset : cudf.Series or iterable container, optional + a container of vertices for displaying corresponding degree. If not + set, degrees are computed for the entire set of vertices. + Returns + ------- + df : cudf.DataFrame + GPU DataFrame of size N (the default) or the size of the given + vertices (vertex_subset) containing the degree. The ordering is + relative to the adjacency list, or that given by the specified + vertex_subset. + df['vertex'] : cudf.Series + The vertex IDs (will be identical to vertex_subset if + specified). + df['degree'] : cudf.Series + The computed degree of the corresponding vertex. + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> all_df = G.degree() + >>> subset_df = G.degree([0,9,12]) + """ + return self._degree(vertex_subset) + + # FIXME: vertex_subset could be a DataFrame for multi-column vertices + def degrees(self, vertex_subset=None): + """ + Compute vertex in-degree and out-degree. By default, this method + computes vertex degrees for the entire set of vertices. If + vertex_subset is provided, this method optionally filters out all but + those listed in vertex_subset. + Parameters + ---------- + vertex_subset : cudf.Series or iterable container, optional + A container of vertices for displaying corresponding degree. If not + set, degrees are computed for the entire set of vertices. + Returns + ------- + df : cudf.DataFrame + GPU DataFrame of size N (the default) or the size of the given + vertices (vertex_subset) containing the degrees. The ordering is + relative to the adjacency list, or that given by the specified + vertex_subset. + df['vertex'] : cudf.Series + The vertex IDs (will be identical to vertex_subset if + specified). + df['in_degree'] : cudf.Series + The in-degree of the vertex. + df['out_degree'] : cudf.Series + The out-degree of the vertex. 
+        Examples
+        --------
+        >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
+        >>>                   dtype=['int32', 'int32', 'float32'], header=None)
+        >>> G = cugraph.Graph()
+        >>> G.from_cudf_edgelist(M, '0', '1')
+        >>> df = G.degrees([0,9,12])
+        """
+        (
+            vertex_col,
+            in_degree_col,
+            out_degree_col,
+        ) = graph_primtypes_wrapper._degrees(self)
+
+        df = cudf.DataFrame()
+        df["vertex"] = vertex_col
+        df["in_degree"] = in_degree_col
+        df["out_degree"] = out_degree_col
+
+        if self.properties.renumbered is True:
+            df = self.renumber_map.unrenumber(df, "vertex")
+
+        if vertex_subset is not None:
+            df = df[df['vertex'].isin(vertex_subset)]
+
+        return df
+
+    def _degree(self, vertex_subset, direction=Direction.ALL):
+        vertex_col, degree_col = graph_primtypes_wrapper._degree(self,
+                                                                 direction)
+        df = cudf.DataFrame()
+        df["vertex"] = vertex_col
+        df["degree"] = degree_col
+
+        if self.properties.renumbered is True:
+            df = self.renumber_map.unrenumber(df, "vertex")
+
+        if vertex_subset is not None:
+            df = df[df['vertex'].isin(vertex_subset)]
+
+        return df
+
+    def to_directed(self, DiG):
+        """
+        Return a directed representation of the graph implementation.
+        This function copies the internal structures and returns the
+        directed view.
+        """
+        DiG.properties.renumbered = self.properties.renumbered
+        DiG.renumber_map = self.renumber_map
+        DiG.edgelist = self.edgelist
+        DiG.adjlist = self.adjlist
+        DiG.transposedadjlist = self.transposedadjlist
+
+    def to_undirected(self, G):
+        """
+        Return an undirected copy of the graph.
+        """
+        G.properties.renumbered = self.properties.renumbered
+        G.renumber_map = self.renumber_map
+        if self.properties.directed is False:
+            G.edgelist = self.edgelist
+            G.adjlist = self.adjlist
+            G.transposedadjlist = self.transposedadjlist
+        else:
+            df = self.edgelist.edgelist_df
+            if self.edgelist.weights:
+                source_col, dest_col, value_col = symmetrize(
+                    df["src"], df["dst"], df["weights"]
+                )
+            else:
+                source_col, dest_col = symmetrize(df["src"], df["dst"])
+                value_col = None
+            G.edgelist = simpleGraphImpl.EdgeList(source_col, dest_col,
+                                                  value_col)
+
+    def has_node(self, n):
+        """
+        Returns True if the graph contains the node n.
+        """
+        if self.properties.renumbered:
+            tmp = self.renumber_map.to_internal_vertex_id(cudf.Series([n]))
+            return tmp[0] is not cudf.NA and tmp[0] >= 0
+        else:
+            df = self.edgelist.edgelist_df[["src", "dst"]]
+            return (df == n).any().any()
+
+    def has_edge(self, u, v):
+        """
+        Returns True if the graph contains the edge (u,v).
+        """
+        if self.properties.renumbered:
+            tmp = cudf.DataFrame({"src": [u, v]})
+            tmp = tmp.astype({"src": "int"})
+            tmp = self.renumber_map.add_internal_vertex_id(
+                tmp, "id", "src", preserve_order=True
+            )
+
+            u = tmp["id"][0]
+            v = tmp["id"][1]
+
+        df = self.edgelist.edgelist_df
+        return ((df["src"] == u) & (df["dst"] == v)).any()
+
+    def has_self_loop(self):
+        """
+        Returns True if the graph has self-loops.
+        """
+        # Detect self loops
+        if self.properties.self_loop is None:
+            elist = self.edgelist.edgelist_df
+            if (elist["src"] == elist["dst"]).any():
+                self.properties.self_loop = True
+            else:
+                self.properties.self_loop = False
+        return self.properties.self_loop
+
+    def edges(self):
+        """
+        Returns all the edges in the graph as a cudf.DataFrame containing
+        sources and destinations. It does not return the edge weights.
+ For viewing edges with weights use view_edge_list() + """ + return self.view_edge_list()[["src", "dst"]] + + def nodes(self): + """ + Returns all the nodes in the graph as a cudf.Series + """ + if self.edgelist is not None: + df = self.edgelist.edgelist_df + if self.properties.renumbered: + # FIXME: If vertices are multicolumn + # this needs to return a dataframe + # FIXME: This relies on current implementation + # of NumberMap, should not really expose + # this, perhaps add a method to NumberMap + return self.renumber_map.implementation.df["0"] + else: + return cudf.concat([df["src"], df["dst"]]).unique() + if self.adjlist is not None: + return cudf.Series(np.arange(0, self.number_of_nodes())) + + def neighbors(self, n): + if self.edgelist is None: + raise Exception("Graph has no Edgelist.") + if self.properties.renumbered: + node = self.renumber_map.to_internal_vertex_id(cudf.Series([n])) + if len(node) == 0: + return cudf.Series(dtype="int") + n = node[0] + + df = self.edgelist.edgelist_df + neighbors = df[df["src"] == n]["dst"].reset_index(drop=True) + if self.properties.renumbered: + # FIXME: Multi-column vertices + return self.renumber_map.from_internal_vertex_id(neighbors)["0"] + else: + return neighbors + + def vertex_column_size(self): + if self.properties.renumbered: + return self.renumber_map.vertex_column_size() + else: + return 1 diff --git a/python/cugraph/structure/graph_primtypes.pxd b/python/cugraph/structure/graph_primtypes.pxd index e46f4092dd4..20581a8ecc0 100644 --- a/python/cugraph/structure/graph_primtypes.pxd +++ b/python/cugraph/structure/graph_primtypes.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -18,24 +18,22 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr - +from libcpp.utility cimport pair +from libcpp.vector cimport vector +from cugraph.raft.common.handle cimport * from rmm._lib.device_buffer cimport device_buffer -cdef extern from "raft/handle.hpp" namespace "raft": - cdef cppclass handle_t: - handle_t() except + - -cdef extern from "graph.hpp" namespace "cugraph": +cdef extern from "cugraph/legacy/graph.hpp" namespace "cugraph::legacy": ctypedef enum PropType: - PROP_UNDEF "cugraph::PROP_UNDEF" - PROP_FALSE "cugraph::PROP_FALSE" - PROP_TRUE "cugraph::PROP_TRUE" + PROP_UNDEF "cugraph::legacy::PROP_UNDEF" + PROP_FALSE "cugraph::legacy::PROP_FALSE" + PROP_TRUE "cugraph::legacy::PROP_TRUE" ctypedef enum DegreeDirection: - DIRECTION_IN_PLUS_OUT "cugraph::DegreeDirection::IN_PLUS_OUT" - DIRECTION_IN "cugraph::DegreeDirection::IN" - DIRECTION_OUT "cugraph::DegreeDirection::OUT" + DIRECTION_IN_PLUS_OUT "cugraph::legacy::DegreeDirection::IN_PLUS_OUT" + DIRECTION_IN "cugraph::legacy::DegreeDirection::IN" + DIRECTION_OUT "cugraph::legacy::DegreeDirection::OUT" struct GraphProperties: bool directed @@ -125,22 +123,11 @@ cdef extern from "graph.hpp" namespace "cugraph": GraphCSRView[VT,ET,WT] view() - -cdef extern from "algorithms.hpp" namespace "cugraph": +cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": cdef unique_ptr[GraphCOO[VT, ET, WT]] get_two_hop_neighbors[VT,ET,WT]( const GraphCSRView[VT, ET, WT] &graph) except + -cdef extern from "functions.hpp" namespace "cugraph": - - cdef unique_ptr[device_buffer] renumber_vertices[VT_IN,VT_OUT,ET]( - ET number_of_edges, - const VT_IN *src, - const VT_IN *dst, - VT_OUT *src_renumbered, - VT_OUT *dst_renumbered, - ET *map_size) except + - cdef extern from "" namespace "std" nogil: cdef unique_ptr[GraphCOO[int,int,float]] move(unique_ptr[GraphCOO[int,int,float]]) @@ -190,54 +177,3 @@ ctypedef fused GraphViewType: cdef coo_to_df(GraphCOOPtrType graph) cdef csr_to_series(GraphCSRPtrType graph) cdef GraphViewType get_graph_view(input_graph, bool weightless=*, GraphViewType* dummy=*) - - -# C++ utilities specifically for Cython -cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": - - ctypedef enum numberTypeEnum: - int32Type "cugraph::cython::numberTypeEnum::int32Type" - int64Type "cugraph::cython::numberTypeEnum::int64Type" - floatType "cugraph::cython::numberTypeEnum::floatType" - doubleType "cugraph::cython::numberTypeEnum::doubleType" - - cdef cppclass graph_container_t: - pass - - cdef void populate_graph_container( - graph_container_t &graph_container, - handle_t &handle, - void *src_vertices, - void *dst_vertices, - void *weights, - void *vertex_partition_offsets, - numberTypeEnum vertexType, - numberTypeEnum edgeType, - numberTypeEnum weightType, - size_t num_partition_edges, - size_t num_global_vertices, - size_t num_global_edges, - bool sorted_by_degree, - bool transposed, - bool multi_gpu) except + - - ctypedef enum graphTypeEnum: - LegacyCSR "cugraph::cython::graphTypeEnum::LegacyCSR" - LegacyCSC "cugraph::cython::graphTypeEnum::LegacyCSC" - LegacyCOO "cugraph::cython::graphTypeEnum::LegacyCOO" - - cdef void populate_graph_container_legacy( - graph_container_t &graph_container, - graphTypeEnum legacyType, - const handle_t &handle, - void *offsets, - void *indices, - void *weights, - numberTypeEnum offsetType, - numberTypeEnum indexType, - numberTypeEnum weightType, - size_t num_global_vertices, - size_t num_global_edges, - int *local_vertices, 
- int *local_edges, - int *local_offsets) except + diff --git a/python/cugraph/structure/graph_primtypes.pyx b/python/cugraph/structure/graph_primtypes.pyx index f3f0fd9b9a6..da16f8f4c8a 100644 --- a/python/cugraph/structure/graph_primtypes.pyx +++ b/python/cugraph/structure/graph_primtypes.pyx @@ -93,6 +93,9 @@ cdef GraphCOOViewType get_coo_graph_view(input_graph, bool weighted=True, GraphC if not input_graph.edgelist: input_graph.view_edge_list() + num_edges = input_graph.number_of_edges(directed_edges=True) + num_verts = input_graph.number_of_vertices() + cdef uintptr_t c_src = input_graph.edgelist.edgelist_df['src'].__cuda_array_interface__['data'][0] cdef uintptr_t c_dst = input_graph.edgelist.edgelist_df['dst'].__cuda_array_interface__['data'][0] cdef uintptr_t c_weights = NULL @@ -101,8 +104,6 @@ cdef GraphCOOViewType get_coo_graph_view(input_graph, bool weighted=True, GraphC if input_graph.edgelist.weights and weighted: c_weights = input_graph.edgelist.edgelist_df['weights'].__cuda_array_interface__['data'][0] - num_verts = input_graph.number_of_vertices() - num_edges = input_graph.number_of_edges(directed_edges=True) cdef GraphCOOViewType in_graph if GraphCOOViewType is GraphCOOViewFloat: in_graph = GraphCOOViewFloat(c_src, c_dst, c_weights, num_verts, num_edges) diff --git a/python/cugraph/structure/graph_primtypes_wrapper.pyx b/python/cugraph/structure/graph_primtypes_wrapper.pyx index 7bc62b9a1af..95de1d70732 100644 --- a/python/cugraph/structure/graph_primtypes_wrapper.pyx +++ b/python/cugraph/structure/graph_primtypes_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
 # You may obtain a copy of the License at
@@ -18,9 +18,9 @@
 from cugraph.structure.graph_primtypes cimport *
 from cugraph.structure.graph_primtypes cimport get_two_hop_neighbors as c_get_two_hop_neighbors
-from cugraph.structure.graph_primtypes cimport renumber_vertices as c_renumber_vertices
 from cugraph.structure.utils_wrapper import *
 from libcpp cimport bool
+import enum
 from libc.stdint cimport uintptr_t
 from rmm._lib.device_buffer cimport device_buffer, DeviceBuffer
@@ -45,43 +45,10 @@ def datatype_cast(cols, dtypes):
     return cols_out
 
 
-def renumber(source_col, dest_col):
-    num_edges = len(source_col)
-
-    src_renumbered = cudf.Series(np.zeros(num_edges), dtype=np.int32)
-    dst_renumbered = cudf.Series(np.zeros(num_edges), dtype=np.int32)
-
-    cdef uintptr_t c_src = source_col.__cuda_array_interface__['data'][0]
-    cdef uintptr_t c_dst = dest_col.__cuda_array_interface__['data'][0]
-    cdef uintptr_t c_src_renumbered = src_renumbered.__cuda_array_interface__['data'][0]
-    cdef uintptr_t c_dst_renumbered = dst_renumbered.__cuda_array_interface__['data'][0]
-    cdef int map_size = 0
-    cdef int n_edges = num_edges
-
-    cdef unique_ptr[device_buffer] numbering_map
-
-    if (source_col.dtype == np.int32):
-        numbering_map = move(c_renumber_vertices[int,int,int](n_edges,
-                                                              c_src,
-                                                              c_dst,
-                                                              c_src_renumbered,
-                                                              c_dst_renumbered,
-                                                              &map_size))
-    else:
-        numbering_map = move(c_renumber_vertices[long,int,int](n_edges,
-                                                               c_src,
-                                                               c_dst,
-                                                               c_src_renumbered,
-                                                               c_dst_renumbered,
-                                                               &map_size))
-
-
-    map = DeviceBuffer.c_from_unique_ptr(move(numbering_map))
-    map = Buffer(map)
-
-    output_map = cudf.Series(data=map, dtype=source_col.dtype)
-
-    return src_renumbered, dst_renumbered, output_map
+class Direction(enum.Enum):
+    ALL = 0
+    IN = 1
+    OUT = 2
 
 
 def view_adj_list(input_graph):
@@ -137,7 +104,7 @@ def view_edge_list(input_graph):
     return src_indices, indices, weights
 
 
-def _degree_coo(edgelist_df, src_name, dst_name, x=0, num_verts=None, sID=None):
+def _degree_coo(edgelist_df, src_name, dst_name, direction=Direction.ALL, num_verts=None, sID=None):
     #
     # Computing the degree of the input graph from COO
     #
@@ -146,11 +113,11 @@ def _degree_coo(edgelist_df, src_name, dst_name, x=0, num_verts=None, sID=None):
     src = edgelist_df[src_name]
     dst = edgelist_df[dst_name]
 
-    if x == 0:
+    if direction == Direction.ALL:
         dir = DIRECTION_IN_PLUS_OUT
-    elif x == 1:
+    elif direction == Direction.IN:
         dir = DIRECTION_IN
-    elif x == 2:
+    elif direction == Direction.OUT:
         dir = DIRECTION_OUT
     else:
-        raise Exception("x should be 0, 1 or 2")
+        raise Exception("direction should be Direction.ALL, Direction.IN or Direction.OUT")
@@ -185,17 +152,17 @@ def _degree_coo(edgelist_df, src_name, dst_name, x=0, num_verts=None, sID=None):
     return vertex_col, degree_col
 
 
-def _degree_csr(offsets, indices, x=0):
+def _degree_csr(offsets, indices, direction=Direction.ALL):
     cdef DegreeDirection dir
 
-    if x == 0:
+    if direction == Direction.ALL:
         dir = DIRECTION_IN_PLUS_OUT
-    elif x == 1:
+    elif direction == Direction.IN:
         dir = DIRECTION_IN
-    elif x == 2:
+    elif direction == Direction.OUT:
         dir = DIRECTION_OUT
     else:
-        raise Exception("x should be 0, 1 or 2")
+        raise Exception("direction should be Direction.ALL, Direction.IN or Direction.OUT")
 
     [offsets, indices] = datatype_cast([offsets, indices], [np.int32])
@@ -220,44 +187,51 @@ def _degree_csr(offsets, indices, x=0):
     return vertex_col, degree_col
 
 
-def _degree(input_graph, x=0):
-    transpose_x = { 0: 0,
-                    2: 1,
-                    1: 2 }
+def _mg_degree(input_graph, direction=Direction.ALL):
+    if input_graph.edgelist is None:
+        input_graph.compute_renumber_edge_list(transposed=False)
+    input_ddf = input_graph.edgelist.edgelist_df
+    num_verts = input_ddf[['src', 'dst']].max().max().compute() + 1
+    data = DistributedDataHandler.create(data=input_ddf)
+    comms = Comms.get_comms()
+    client = default_client()
+    data.calculate_parts_to_sizes(comms)
+    if direction == Direction.IN:
+        degree_ddf = [client.submit(_degree_coo, wf[1][0], 'src', 'dst', Direction.IN, num_verts, comms.sessionId, workers=[wf[0]]) for idx, wf in enumerate(data.worker_to_parts.items())]
+    elif direction == Direction.OUT:
+        # out-degree of G is the in-degree of the transposed (dst, src) list
+        degree_ddf = [client.submit(_degree_coo, wf[1][0], 'dst', 'src', Direction.IN, num_verts, comms.sessionId, workers=[wf[0]]) for idx, wf in enumerate(data.worker_to_parts.items())]
+    else:
+        raise Exception("direction should be Direction.IN or Direction.OUT")
+    wait(degree_ddf)
+    return degree_ddf[0].result()
+
+
+def _degree(input_graph, direction=Direction.ALL):
+    transpose_direction = { Direction.ALL: Direction.ALL,
+                            Direction.IN: Direction.OUT,
+                            Direction.OUT: Direction.IN }
 
     if input_graph.adjlist is not None:
         return _degree_csr(input_graph.adjlist.offsets,
                            input_graph.adjlist.indices,
-                           x)
+                           direction)
 
     if input_graph.transposedadjlist is not None:
         return _degree_csr(input_graph.transposedadjlist.offsets,
                            input_graph.transposedadjlist.indices,
-                           transpose_x[x])
-
-    if input_graph.edgelist is None and input_graph.distributed:
-        input_graph.compute_renumber_edge_list(transposed=False)
+                           transpose_direction[direction])
 
     if input_graph.edgelist is not None:
-        if isinstance(input_graph.edgelist.edgelist_df, dc.DataFrame):
-            input_ddf = input_graph.edgelist.edgelist_df
-            num_verts = input_ddf[['src', 'dst']].max().max().compute() + 1
-            data = DistributedDataHandler.create(data=input_ddf)
-            comms = Comms.get_comms()
-            client = default_client()
-            data.calculate_parts_to_sizes(comms)
-            degree_ddf = [client.submit(_degree_coo, wf[1][0], 'src', 'dst', x, num_verts, comms.sessionId, workers=[wf[0]]) for idx, wf in enumerate(data.worker_to_parts.items())]
-            wait(degree_ddf)
-            return degree_ddf[0].result()
         return _degree_coo(input_graph.edgelist.edgelist_df,
-                           'src', 'dst', x)
+                           'src', 'dst', direction)
 
     raise Exception("input_graph not COO, CSR or CSC")
 
 
 def _degrees(input_graph):
-    verts, indegrees = _degree(input_graph,1)
-    verts, outdegrees = _degree(input_graph, 2)
+    verts, indegrees = _degree(input_graph, Direction.IN)
+    verts, outdegrees = _degree(input_graph, Direction.OUT)
 
     return verts, indegrees, outdegrees
diff --git a/python/cugraph/structure/graph_utilities.pxd b/python/cugraph/structure/graph_utilities.pxd
new file mode 100644
index 00000000000..b701ba8b400
--- /dev/null
+++ b/python/cugraph/structure/graph_utilities.pxd
@@ -0,0 +1,195 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
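As an aside on the wrapper refactor above: the `Direction` enum replaces the old magic 0/1/2 direction codes, and the public degree APIs map onto it one-to-one. A minimal single-GPU sketch of that mapping (the dataset path and column names are assumptions for illustration):

```python
import cudf

import cugraph
from cugraph.structure.graph_primtypes_wrapper import Direction

M = cudf.read_csv("datasets/karate.csv", delimiter=" ",
                  names=["src", "dst", "wgt"],
                  dtype=["int32", "int32", "float32"])
G = cugraph.DiGraph()
G.from_cudf_edgelist(M, source="src", destination="dst")

# Each public call resolves to one enum member internally:
in_df = G.in_degree()    # wrapper._degree(..., Direction.IN)
out_df = G.out_degree()  # wrapper._degree(..., Direction.OUT)
all_df = G.degree()      # wrapper._degree(..., Direction.ALL)

# ALL should equal in + out per vertex; merge on vertex id to be
# order-safe, since unrenumbering does not guarantee row order.
in_df = in_df.rename(columns={"degree": "in"})
out_df = out_df.rename(columns={"degree": "out"})
chk = all_df.merge(in_df, on="vertex").merge(out_df, on="vertex")
assert (chk["degree"] == chk["in"] + chk["out"]).all()
```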
+ +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + + +from cugraph.raft.common.handle cimport * +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport pair +from libcpp.vector cimport vector +from rmm._lib.device_buffer cimport device_buffer + +# C++ graph utilities +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": + + ctypedef enum numberTypeEnum: + int32Type "cugraph::cython::numberTypeEnum::int32Type" + int64Type "cugraph::cython::numberTypeEnum::int64Type" + floatType "cugraph::cython::numberTypeEnum::floatType" + doubleType "cugraph::cython::numberTypeEnum::doubleType" + + cdef cppclass graph_container_t: + pass + + cdef void populate_graph_container( + graph_container_t &graph_container, + handle_t &handle, + void *src_vertices, + void *dst_vertices, + void *weights, + void *vertex_partition_offsets, + void *segment_offsets, + size_t num_segments, + numberTypeEnum vertexType, + numberTypeEnum edgeType, + numberTypeEnum weightType, + size_t num_local_edges, + size_t num_global_vertices, + size_t num_global_edges, + bool is_weighted, + bool is_symmetric, + bool transposed, + bool multi_gpu) except + + + ctypedef enum graphTypeEnum: + LegacyCSR "cugraph::cython::graphTypeEnum::LegacyCSR" + LegacyCSC "cugraph::cython::graphTypeEnum::LegacyCSC" + LegacyCOO "cugraph::cython::graphTypeEnum::LegacyCOO" + + cdef void populate_graph_container_legacy( + graph_container_t &graph_container, + graphTypeEnum legacyType, + const handle_t &handle, + void *offsets, + void *indices, + void *weights, + numberTypeEnum offsetType, + numberTypeEnum indexType, + numberTypeEnum weightType, + size_t num_global_vertices, + size_t num_global_edges, + int *local_vertices, + int *local_edges, + int *local_offsets) except + + + cdef cppclass cy_multi_edgelists_t: + size_t number_of_vertices + size_t number_of_edges + size_t number_of_subgraph + unique_ptr[device_buffer] src_indices + unique_ptr[device_buffer] dst_indices + unique_ptr[device_buffer] edge_data + unique_ptr[device_buffer] subgraph_offsets + + cdef cppclass random_walk_ret_t: + size_t coalesced_sz_v_ + size_t coalesced_sz_w_ + size_t num_paths_ + size_t max_depth_ + unique_ptr[device_buffer] d_coalesced_v_ + unique_ptr[device_buffer] d_coalesced_w_ + unique_ptr[device_buffer] d_sizes_ + + cdef cppclass random_walk_path_t: + unique_ptr[device_buffer] d_v_offsets + unique_ptr[device_buffer] d_w_sizes + unique_ptr[device_buffer] d_w_offsets + + cdef cppclass graph_generator_t: + unique_ptr[device_buffer] d_source + unique_ptr[device_buffer] d_destination + +cdef extern from "" namespace "std" nogil: + cdef device_buffer move(device_buffer) + cdef unique_ptr[device_buffer] move(unique_ptr[device_buffer]) + cdef cy_multi_edgelists_t move(cy_multi_edgelists_t) + cdef unique_ptr[cy_multi_edgelists_t] move(unique_ptr[cy_multi_edgelists_t]) + +# renumber_edgelist() interface utilities: +# +# +# 1. `cdef extern partition_t`: +# +cdef extern from "cugraph/experimental/graph_view.hpp" namespace "cugraph::experimental": + + cdef cppclass partition_t[vertex_t]: + pass + + +# 2. 
return type for shuffle: +# +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": + + cdef cppclass major_minor_weights_t[vertex_t, edge_t, weight_t]: + major_minor_weights_t(const handle_t &handle) + pair[unique_ptr[device_buffer], size_t] get_major_wrap() + pair[unique_ptr[device_buffer], size_t] get_minor_wrap() + pair[unique_ptr[device_buffer], size_t] get_weights_wrap() + unique_ptr[vector[edge_t]] get_edge_counts_wrap() + + +ctypedef fused shuffled_vertices_t: + major_minor_weights_t[int, int, float] + major_minor_weights_t[int, int, double] + major_minor_weights_t[int, long, float] + major_minor_weights_t[int, long, double] + major_minor_weights_t[long, long, float] + major_minor_weights_t[long, long, double] + +# 3. return type for renumber: +# +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": + + cdef cppclass renum_tuple_t[vertex_t, edge_t]: + renum_tuple_t(const handle_t &handle) + pair[unique_ptr[device_buffer], size_t] get_dv_wrap() + vertex_t& get_num_vertices() + edge_t& get_num_edges() + vector[vertex_t]& get_segment_offsets() + unique_ptr[vector[vertex_t]] get_segment_offsets_wrap() + int get_part_row_size() + int get_part_col_size() + int get_part_comm_rank() + unique_ptr[vector[vertex_t]] get_partition_offsets_wrap() + pair[vertex_t, vertex_t] get_part_local_vertex_range() + vertex_t get_part_local_vertex_first() + vertex_t get_part_local_vertex_last() + pair[vertex_t, vertex_t] get_part_vertex_partition_range(size_t vertex_partition_idx) + vertex_t get_part_vertex_partition_first(size_t vertex_partition_idx) + vertex_t get_part_vertex_partition_last(size_t vertex_partition_idx) + vertex_t get_part_vertex_partition_size(size_t vertex_partition_idx) + size_t get_part_number_of_matrix_partitions() + vertex_t get_part_matrix_partition_major_first(size_t partition_idx) + vertex_t get_part_matrix_partition_major_last(size_t partition_idx) + vertex_t get_part_matrix_partition_major_value_start_offset(size_t partition_idx) + pair[vertex_t, vertex_t] get_part_matrix_partition_minor_range() + vertex_t get_part_matrix_partition_minor_first() + vertex_t get_part_matrix_partition_minor_last() + +# 4. `sort_and_shuffle_values()` wrapper: +# +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": + + cdef unique_ptr[major_minor_weights_t[vertex_t, edge_t, weight_t]] call_shuffle[vertex_t, edge_t, weight_t]( + const handle_t &handle, + vertex_t *edgelist_major_vertices, + vertex_t *edgelist_minor_vertices, + weight_t* edgelist_weights, + edge_t num_edges) except + + +# 5. `renumber_edgelist()` wrapper +# +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": + + cdef unique_ptr[renum_tuple_t[vertex_t, edge_t]] call_renumber[vertex_t, edge_t]( + const handle_t &handle, + vertex_t *edgelist_major_vertices, + vertex_t *edgelist_minor_vertices, + const vector[edge_t]& edge_counts, + bool do_check, + bool multi_gpu) except + diff --git a/python/cugraph/structure/hypergraph.py b/python/cugraph/structure/hypergraph.py index a11c937d83d..c5a1ac39e4f 100644 --- a/python/cugraph/structure/hypergraph.py +++ b/python/cugraph/structure/hypergraph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -36,7 +36,7 @@ import cudf import numpy as np -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph def hypergraph( @@ -66,24 +66,20 @@ def hypergraph( components as dataframes. The transform reveals relationships between the rows and unique values. This transform is useful for lists of events, samples, relationships, and other structured high-dimensional data. - The transform creates a node for every row, and turns a row's column entries into node attributes. If direct=False (default), every unique value within a column is also turned into a node. Edges are added to connect a row's nodes to each of its column nodes, or if direct=True, to one another. Nodes are given the attribute specified by ``NODETYPE`` that corresponds to the originating column name, or if a row ``EVENTID``. - Consider a list of events. Each row represents a distinct event, and each column some metadata about an event. If multiple events have common metadata, they will be transitively connected through those metadata values. Conversely, if an event has unique metadata, the unique metadata will turn into nodes that only have connections to the event node. - For best results, set ``EVENTID`` to a row's unique ID, ``SKIP`` to all non-categorical columns (or ``columns`` to all categorical columns), and ``categories`` to group columns with the same kinds of values. - Parameters ---------- values : cudf.DataFrame @@ -130,7 +126,6 @@ def hypergraph( The name to use as the node type column in the graph and node DFs. EDGETYPE : str, optional, default "edge_type" The name to use as the edge type column in the graph and edge DF. - Returns ------- result : dict {"nodes", "edges", "graph", "events", "entities"} diff --git a/python/cugraph/structure/number_map.py b/python/cugraph/structure/number_map.py index b9ed8eb2e58..1e3281ca33b 100644 --- a/python/cugraph/structure/number_map.py +++ b/python/cugraph/structure/number_map.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,100 +11,45 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# -import cudf +from dask.distributed import wait, default_client +from cugraph.dask.common.input_utils import get_distributed_data +from cugraph.structure import renumber_wrapper as c_renumber +import cugraph.comms.comms as Comms import dask_cudf import numpy as np -import bisect +import cudf + + +def call_renumber(sID, + data, + num_edges, + is_mnmg, + store_transposed): + wid = Comms.get_worker_id(sID) + handle = Comms.get_handle(sID) + return c_renumber.renumber(data[0], + num_edges, + wid, + handle, + is_mnmg, + store_transposed) class NumberMap: - """ - Class used to translate external vertex ids to internal vertex ids - in the cuGraph framework. - - Internal vertex ids are assigned by hashing the external vertex ids - into a structure to eliminate duplicates, and the resulting list - of unique vertices are assigned integers from [0, V) where V is - the number of unique vertices. 
- - In Single GPU mode, internal vertex ids are constructed using - cudf functions, with a cudf.DataFrame containing the mapping - from external vertex identifiers and internal vertex identifiers - allowing for mapping vertex identifiers in either direction. In - this mode, the order of the output from the mapping functions is - non-deterministic. cudf makes no guarantees about order. If - matching the input order is required set the preserve_order - to True. - - In Multi GPU mode, internal vertex ids are constucted using - dask_cudf functions, with a dask_cudf.DataFrame containing - the mapping from external vertex identifiers and internal - vertex identifiers allowing for mapping vertex identifiers - in either direction. In this mode, the partitioning of - the number_map and the output from any of the mapping functions - are non-deterministic. dask_cudf makes no guarantees about the - partitioning or order of the output. As of this release, - there is no mechanism for controlling that, this will be - addressed at some point. - """ class SingleGPU: def __init__(self, df, src_col_names, dst_col_names, id_type, store_transposed): self.col_names = NumberMap.compute_vals(src_col_names) - self.df = cudf.DataFrame() + self.src_col_names = src_col_names + self.dst_col_names = dst_col_names + self.df = df self.id_type = id_type self.store_transposed = store_transposed - - source_count = 0 - dest_count = 0 - - if store_transposed: - dest_count = 1 - else: - source_count = 1 - - tmp = ( - df[src_col_names] - .assign(count=source_count) - .groupby(src_col_names) - .sum() - .reset_index() - .rename( - columns=dict(zip(src_col_names, self.col_names)), - copy=False, - ) - ) - - if dst_col_names is not None: - tmp_dst = ( - df[dst_col_names] - .assign(count=dest_count) - .groupby(dst_col_names) - .sum() - .reset_index() - ) - for newname, oldname in zip(self.col_names, dst_col_names): - self.df[newname] = tmp[newname].append(tmp_dst[oldname]) - self.df['count'] = tmp['count'].append(tmp_dst['count']) - else: - for newname, oldname in zip(self.col_names, dst_col_names): - self.df[newname] = tmp[newname] - self.df['count'] = tmp['count'] - self.numbered = False - def compute(self): - if not self.numbered: - tmp = self.df.groupby(self.col_names).sum().sort_values( - 'count', ascending=False - ).reset_index().drop(columns='count') - - tmp["id"] = tmp.index.astype(self.id_type) - self.df = tmp - self.numbered = True - def to_internal_vertex_id(self, df, col_names): tmp_df = df[col_names].rename( columns=dict(zip(col_names, self.col_names)), copy=False @@ -117,6 +63,25 @@ def to_internal_vertex_id(self, df, col_names): .reset_index()["id"] ) + def from_internal_vertex_id( + self, df, internal_column_name, external_column_names + ): + tmp_df = self.df.merge( + df, + right_on=internal_column_name, + left_on="id", + how="right", + ) + if internal_column_name != "id": + tmp_df = tmp_df.drop(columns=["id"]) + if external_column_names is None: + return tmp_df + else: + return tmp_df.rename( + columns=dict(zip(self.col_names, external_column_names)), + copy=False, + ) + def add_internal_vertex_id(self, df, id_column_name, col_names, drop, preserve_order): ret = None @@ -162,76 +127,39 @@ def add_internal_vertex_id(self, df, id_column_name, col_names, return ret - def from_internal_vertex_id( - self, df, internal_column_name, external_column_names - ): - tmp_df = self.df.merge( - df, - right_on=internal_column_name, - left_on="id", - how="right", - ) - if internal_column_name != "id": - tmp_df = 
tmp_df.drop(columns=["id"]) - if external_column_names is None: - return tmp_df - else: - return tmp_df.rename( - columns=dict(zip(self.col_names, external_column_names)), - copy=False, - ) - - class MultiGPU: - def extract_vertices( - df, src_col_names, dst_col_names, - internal_col_names, store_transposed - ): - source_count = 0 - dest_count = 0 - - if store_transposed: - dest_count = 1 - else: - source_count = 1 + def indirection_map(self, df, src_col_names, dst_col_names): + tmp_df = cudf.DataFrame() - s = ( + tmp = ( df[src_col_names] - .assign(count=source_count) .groupby(src_col_names) - .sum() + .count() .reset_index() .rename( - columns=dict(zip(src_col_names, internal_col_names)), + columns=dict(zip(src_col_names, self.col_names)), copy=False, ) ) - d = None if dst_col_names is not None: - d = ( + tmp_dst = ( df[dst_col_names] - .assign(count=dest_count) .groupby(dst_col_names) - .sum() + .count() .reset_index() - .rename( - columns=dict(zip(dst_col_names, internal_col_names)), - copy=False, - ) ) + for newname, oldname in zip(self.col_names, dst_col_names): + tmp_df[newname] = tmp[newname].append(tmp_dst[oldname]) + else: + for newname in self.col_names: + tmp_df[newname] = tmp[newname] - reply = cudf.DataFrame() - - for i in internal_col_names: - if d is None: - reply[i] = s[i] - else: - reply[i] = s[i].append(d[i]) - - reply['count'] = s['count'].append(d['count']) - - return reply + tmp_df = tmp_df.groupby(self.col_names).count().reset_index() + tmp_df["id"] = tmp_df.index.astype(self.id_type) + self.df = tmp_df + return tmp_df + class MultiGPU: def __init__( self, ddf, src_col_names, dst_col_names, id_type, store_transposed ): @@ -239,118 +167,45 @@ def __init__( self.val_types = NumberMap.compute_vals_types(ddf, src_col_names) self.val_types["count"] = np.int32 self.id_type = id_type + self.ddf = ddf self.store_transposed = store_transposed - self.ddf = ddf.map_partitions( - NumberMap.MultiGPU.extract_vertices, - src_col_names, - dst_col_names, - self.col_names, - store_transposed, - meta=self.val_types, - ) self.numbered = False - # Function to compute partitions based on known divisions of the - # hash value - def compute_partition(df, divisions): - sample = df.index[0] - partition_id = bisect.bisect_right(divisions, sample) - 1 - return df.assign(partition=partition_id) - - def assign_internal_identifiers_kernel( - local_id, partition, global_id, base_addresses - ): - for i in range(len(local_id)): - global_id[i] = local_id[i] + base_addresses[partition[i]] - - def assign_internal_identifiers(df, base_addresses, id_type): - df = df.assign(local_id=df.index.astype(np.int64)) - df = df.apply_rows( - NumberMap.MultiGPU.assign_internal_identifiers_kernel, - incols=["local_id", "partition"], - outcols={"global_id": id_type}, - kwargs={"base_addresses": base_addresses}, - ) - - return df.drop(columns=["local_id", "hash", "partition"]) - - def assign_global_id(self, ddf, base_addresses, val_types): - val_types["global_id"] = self.id_type - del val_types["hash"] - del val_types["partition"] - - ddf = ddf.map_partitions( - lambda df: NumberMap.MultiGPU.assign_internal_identifiers( - df, base_addresses, self.id_type - ), - meta=val_types, + def to_internal_vertex_id(self, ddf, col_names): + tmp_ddf = ddf[col_names].rename( + columns=dict(zip(col_names, self.col_names))) + for name in self.col_names: + tmp_ddf[name] = tmp_ddf[name].astype(self.ddf[name].dtype) + x = self.ddf.merge( + tmp_ddf, + on=self.col_names, + how="right", ) - return ddf + return x['global_id'] - def 
compute(self): - if not self.numbered: - val_types = self.val_types - val_types["hash"] = np.int32 - - vertices = self.ddf.map_partitions( - lambda df: df.assign(hash=df.hash_columns(self.col_names)), - meta=val_types, - ) - - # Redistribute the ddf based on the hash values - rehashed = vertices.set_index("hash", drop=False) - - # Compute the local partition id (obsolete once - # https://github.com/dask/dask/issues/3707 is completed) - val_types["partition"] = np.int32 - - rehashed_with_partition_id = rehashed.map_partitions( - NumberMap.MultiGPU.compute_partition, - rehashed.divisions, - meta=val_types, - ) - - val_types.pop('count') - - numbering_map = rehashed_with_partition_id.map_partitions( - lambda df: df.groupby( - self.col_names + ["hash", "partition"] - ).sum() - .sort_values('count', ascending=False) - .reset_index() - .drop(columns='count'), - meta=val_types - ) + def from_internal_vertex_id( + self, df, internal_column_name, external_column_names + ): + tmp_df = self.ddf.merge( + df, + right_on=internal_column_name, + left_on="global_id", + how="right" + ).map_partitions(lambda df: df.drop(columns="global_id")) - # - # Compute base address for each partition - # - counts = numbering_map.map_partitions( - lambda df: df.groupby("partition").count() - ).compute()["hash"].to_pandas() - base_addresses = np.zeros(len(counts) + 1, self.id_type) - - for i in range(len(counts)): - base_addresses[i + 1] = base_addresses[i] + counts[i] - - # - # Update each partition with the base address - # - numbering_map = self.assign_global_id( - numbering_map, cudf.Series(base_addresses), val_types + if external_column_names is None: + return tmp_df + else: + return tmp_df.map_partitions( + lambda df: + df.rename( + columns=dict( + zip(self.col_names, external_column_names) + ), + copy=False + ) ) - self.ddf = numbering_map.persist() - self.numbered = True - - def to_internal_vertex_id(self, ddf, col_names): - return self.ddf.merge( - ddf, - right_on=col_names, - left_on=self.col_names, - how="right", - )["global_id"] - def add_internal_vertex_id(self, ddf, id_column_name, col_names, drop, preserve_order): # At the moment, preserve_order cannot be done on @@ -385,39 +240,49 @@ def add_internal_vertex_id(self, ddf, id_column_name, col_names, drop, return ret - def from_internal_vertex_id( - self, df, internal_column_name, external_column_names - ): - tmp_df = self.ddf.merge( - df, - right_on=internal_column_name, - left_on="global_id", - how="right" - ).map_partitions(lambda df: df.drop(columns="global_id")) + def indirection_map(self, ddf, src_col_names, dst_col_names): - if external_column_names is None: - return tmp_df - else: - return tmp_df.map_partitions( - lambda df: - df.rename( - columns=dict( - zip(self.col_names, external_column_names) - ), - copy=False - ) + tmp = ( + ddf[src_col_names] + .groupby(src_col_names) + .count() + .reset_index() + .rename( + columns=dict(zip(src_col_names, self.col_names)), ) + ) + + if dst_col_names is not None: + tmp_dst = ( + ddf[dst_col_names] + .groupby(dst_col_names) + .count() + .reset_index() + ) + for i, (newname, oldname) in enumerate(zip(self.col_names, + dst_col_names)): + if i == 0: + tmp_df = tmp[newname].append(tmp_dst[oldname]).\ + to_frame(name=newname) + else: + tmp_df[newname] = tmp[newname].append(tmp_dst[oldname]) + else: + for newname in self.col_names: + tmp_df[newname] = tmp[newname] + tmp_ddf = tmp_df.groupby(self.col_names).count().reset_index() + + # Set global index + tmp_ddf = tmp_ddf.assign(idx=1) + tmp_ddf['global_id'] = 
tmp_ddf.idx.cumsum() - 1 + tmp_ddf = tmp_ddf.drop(columns='idx') + tmp_ddf = tmp_ddf.persist() + self.ddf = tmp_ddf + return tmp_ddf def __init__(self, id_type=np.int32): self.implementation = None self.id_type = id_type - def aggregate_count_and_partition(df): - d = {} - d['count'] = df['count'].sum() - d['partition'] = df['partition'].min() - return cudf.Series(d, index=['count', 'partition']) - def compute_vals_types(df, column_names): """ Helper function to compute internal column names and types @@ -443,125 +308,19 @@ def compute_vals(column_names): """ return [str(i) for i in range(len(column_names))] - def from_dataframe( - self, df, src_col_names, dst_col_names=None, store_transposed=False - ): - """ - Populate the numbering map with vertices from the specified - columns of the provided DataFrame. - - Parameters - ---------- - df : cudf.DataFrame or dask_cudf.DataFrame - Contains a list of external vertex identifiers that will be - numbered by the NumberMap class. - src_col_names: list of strings - This list of 1 or more strings contains the names - of the columns that uniquely identify an external - vertex identifier for source vertices - dst_col_names: list of strings - This list of 1 or more strings contains the names - of the columns that uniquely identify an external - vertex identifier for destination vertices - store_transposed : bool - Identify how the graph adjacency will be used. - If True, the graph will be organized by destination. - If False, the graph will be organized by source - - """ - if self.implementation is not None: - raise Exception("NumberMap is already populated") - - if dst_col_names is not None and len(src_col_names) != len( - dst_col_names - ): - raise Exception( - "src_col_names must have same length as dst_col_names" - ) - - if type(df) is cudf.DataFrame: - self.implementation = NumberMap.SingleGPU( - df, src_col_names, dst_col_names, self.id_type, - store_transposed - ) - elif type(df) is dask_cudf.DataFrame: - self.implementation = NumberMap.MultiGPU( - df, src_col_names, dst_col_names, self.id_type, - store_transposed - ) - else: - raise Exception("df must be cudf.DataFrame or dask_cudf.DataFrame") - - self.implementation.compute() - - def from_series(self, src_series, dst_series=None, store_transposed=False): - """ - Populate the numbering map with vertices from the specified - pair of series objects, one for the source and one for - the destination - - Parameters - ---------- - src_series: cudf.Series or dask_cudf.Series - Contains a list of external vertex identifiers that will be - numbered by the NumberMap class. - dst_series: cudf.Series or dask_cudf.Series - Contains a list of external vertex identifiers that will be - numbered by the NumberMap class. - store_transposed : bool - Identify how the graph adjacency will be used. - If True, the graph will be organized by destination.
- If False, the graph will be organized by source - """ - if self.implementation is not None: - raise Exception("NumberMap is already populated") - - if dst_series is not None and type(src_series) != type(dst_series): - raise Exception("src_series and dst_series must have same type") - - if type(src_series) is cudf.Series: - dst_series_list = None - df = cudf.DataFrame() - df["s"] = src_series - if dst_series is not None: - df["d"] = dst_series - dst_series_list = ["d"] - self.implementation = NumberMap.SingleGPU( - df, ["s"], dst_series_list, self.id_type, store_transposed - ) - elif type(src_series) is dask_cudf.Series: - dst_series_list = None - df = dask_cudf.DataFrame() - df["s"] = src_series - if dst_series is not None: - df["d"] = dst_series - dst_series_list = ["d"] - self.implementation = NumberMap.MultiGPU( - df, ["s"], dst_series_list, self.id_type, store_transposed - ) - else: - raise Exception( - "src_series must be cudf.Series or " "dask_cudf.Series" - ) - - self.implementation.compute() - def to_internal_vertex_id(self, df, col_names=None): """ Given a collection of external vertex ids, return the internal vertex ids - Parameters ---------- df: cudf.DataFrame, cudf.Series, dask_cudf.DataFrame, dask_cudf.Series Contains a list of external vertex identifiers that will be converted into internal vertex identifiers - col_names: (optional) list of strings This list of 1 or more strings contains the names of the columns that uniquely identify an external vertex identifier - Returns --------- vertex_ids : cudf.Series or dask_cudf.Series @@ -569,7 +328,6 @@ def to_internal_vertex_id(self, df, col_names=None): does not guarantee order or partitioning (in the case of dask_cudf) of vertex ids. If order matters use add_internal_vertex_id - """ tmp_df = None tmp_col_names = None @@ -587,11 +345,7 @@ def to_internal_vertex_id(self, df, col_names=None): reply = self.implementation.to_internal_vertex_id(tmp_df, tmp_col_names) - - if type(df) in [cudf.DataFrame, dask_cudf.DataFrame]: - return reply["0"] - else: - return reply + return reply def add_internal_vertex_id( self, df, id_column_name="id", col_names=None, drop=False, @@ -600,34 +354,27 @@ def add_internal_vertex_id( """ Given a collection of external vertex ids, return the internal vertex ids combined with the input data. - If a series-type input is provided then the series will be in a column named '0'. Otherwise the input column names in the DataFrame will be preserved. - Parameters ---------- df: cudf.DataFrame, cudf.Series, dask_cudf.DataFrame, dask_cudf.Series Contains a list of external vertex identifiers that will be converted into internal vertex identifiers - id_column_name: (optional) string The name to be applied to the column containing the id (defaults to 'id') - col_names: (optional) list of strings This list of 1 or more strings contains the names of the columns that uniquely identify an external vertex identifier - drop: (optional) boolean If True, drop the column names specified in col_names from the returned DataFrame. Defaults to False. - preserve_order: (optional) boolean If True, do extra sorting work to preserve the order of the input DataFrame. Defaults to False. - Returns --------- df : cudf.DataFrame or dask_cudf.DataFrame @@ -635,7 +382,6 @@ def add_internal_vertex_id( with an additional column containing the internal vertex id. Note that there is no guarantee of the order or partitioning of elements in the returned DataFrame.
- """ tmp_df = None tmp_col_names = None @@ -671,7 +417,6 @@ def from_internal_vertex_id( """ Given a collection of internal vertex ids, return a DataFrame of the external vertex ids - Parameters ---------- df: cudf.DataFrame, cudf.Series, dask_cudf.DataFrame, dask_cudf.Series @@ -681,20 +426,16 @@ def from_internal_vertex_id( in a column labeled 'id'. If df is a dataframe type object then internal_column_name should identify which column corresponds the the internal vertex id that should be converted - internal_column_name: (optional) string Name of the column containing the internal vertex id. If df is a series then this parameter is ignored. If df is a DataFrame this parameter is required. - external_column_names: (optional) string or list of strings Name of the columns that define an external vertex id. If not specified, columns will be labeled '0', '1,', ..., 'n-1' - drop: (optional) boolean If True the internal column name will be dropped from the DataFrame. Defaults to False. - Returns --------- df : cudf.DataFrame or dask_cudf.DataFrame @@ -727,109 +468,143 @@ def from_internal_vertex_id( return output_df - def column_names(self): - """ - Return the list of internal column names - - Returns - ---------- - List of column names ('0', '1', ..., 'n-1') - """ - return self.implementation.col_names - - def renumber(df, src_col_names, dst_col_names, preserve_order=False, - store_transposed=False): - """ - Given a single GPU or distributed DataFrame, use src_col_names and - dst_col_names to identify the source vertex identifiers and destination - vertex identifiers, respectively. - - Internal vertex identifiers will be created, numbering vertices as - integers starting from 0. - - The function will return a DataFrame containing the original dataframe - contents with a new column labeled 'src' containing the renumbered - source vertices and a new column labeled 'dst' containing the - renumbered dest vertices, along with a NumberMap object that contains - the number map for the numbering that was used. - - Note that this function does not guarantee order in single GPU mode, - and does not guarantee order or partitioning in multi-GPU mode. If you - wish to preserve ordering, add an index column to df and sort the - return by that index column. - - Parameters - ---------- - df: cudf.DataFrame or dask_cudf.DataFrame - Contains a list of external vertex identifiers that will be - numbered by the NumberMap class. - src_col_names: string or list of strings - This list of 1 or more strings contain the names - of the columns that uniquely identify an external - vertex identifier for source vertices - dst_col_names: string or list of strings - This list of 1 or more strings contain the names - of the columns that uniquely identify an external - vertex identifier for destination vertices - store_transposed : bool - Identify how the graph adjacency will be used. - If True, the graph will be organized by destination. - If False, the graph will be organized by source - - Returns - --------- - df : cudf.DataFrame or dask_cudf.DataFrame - The original DataFrame columns exist unmodified. Columns - are added to the DataFrame to identify the external vertex - identifiers. If external_columns is specified, these names - are used as the names of the output columns. If external_columns - is not specifed the columns are labeled '0', ... 'n-1' based on - the number of columns identifying the external vertex identifiers. 
- - number_map : NumberMap - The number map object that retains - the mapping between - internal vertex identifiers and external vertex identifiers. + def renumber_and_segment( + df, src_col_names, dst_col_names, preserve_order=False, + store_transposed=False + ): + if isinstance(src_col_names, list): + renumber_type = 'legacy' + elif not (df[src_col_names].dtype == np.int32 or + df[src_col_names].dtype == np.int64): + renumber_type = 'legacy' + else: + renumber_type = 'experimental' - Examples - -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> - >>> df, number_map = NumberMap.renumber(df, '0', '1') - >>> - >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(df, 'src', 'dst') - """ renumber_map = NumberMap() - - if isinstance(src_col_names, list): - renumber_map.from_dataframe(df, src_col_names, dst_col_names) - df = renumber_map.add_internal_vertex_id( - df, "src", src_col_names, drop=True, - preserve_order=preserve_order + if not isinstance(src_col_names, list): + src_col_names = [src_col_names] + dst_col_names = [dst_col_names] + if type(df) is cudf.DataFrame: + renumber_map.implementation = NumberMap.SingleGPU( + df, src_col_names, dst_col_names, renumber_map.id_type, + store_transposed ) - df = renumber_map.add_internal_vertex_id( - df, "dst", dst_col_names, drop=True, - preserve_order=preserve_order + elif type(df) is dask_cudf.DataFrame: + renumber_map.implementation = NumberMap.MultiGPU( + df, src_col_names, dst_col_names, renumber_map.id_type, + store_transposed ) else: - renumber_map.from_dataframe(df, [src_col_names], [dst_col_names]) + raise Exception("df must be cudf.DataFrame or dask_cudf.DataFrame") + + if renumber_type == 'legacy': + indirection_map = renumber_map.implementation.\ + indirection_map(df, + src_col_names, + dst_col_names) df = renumber_map.add_internal_vertex_id( df, "src", src_col_names, drop=True, preserve_order=preserve_order ) - df = renumber_map.add_internal_vertex_id( df, "dst", dst_col_names, drop=True, preserve_order=preserve_order ) + else: + df = df.rename(columns={src_col_names[0]: "src", + dst_col_names[0]: "dst"}) - if type(df) is dask_cudf.DataFrame: - df = df.persist() + num_edges = len(df) - return df, renumber_map + if isinstance(df, dask_cudf.DataFrame): + is_mnmg = True + else: + is_mnmg = False + + if is_mnmg: + client = default_client() + data = get_distributed_data(df) + result = [(client.submit(call_renumber, + Comms.get_session_id(), + wf[1], + num_edges, + is_mnmg, + store_transposed, + workers=[wf[0]]), wf[0]) + for idx, wf in enumerate(data.worker_to_parts.items())] + wait(result) + + def get_renumber_map(data): + return data[0] + + def get_segment_offsets(data): + return data[1] + + def get_renumbered_df(data): + return data[2] + + renumbering_map = dask_cudf.from_delayed( + [client.submit(get_renumber_map, + data, + workers=[wf]) + for (data, wf) in result]) + + list_of_segment_offsets = client.gather( + [client.submit(get_segment_offsets, + data, + workers=[wf]) + for (data, wf) in result]) + aggregate_segment_offsets = [] + for segment_offsets in list_of_segment_offsets: + aggregate_segment_offsets.extend(segment_offsets) + + renumbered_df = dask_cudf.from_delayed( + [client.submit(get_renumbered_df, + data, + workers=[wf]) + for (data, wf) in result]) + if renumber_type == 'legacy': + renumber_map.implementation.ddf = indirection_map.merge( + renumbering_map, + right_on='original_ids', left_on='global_id', + how='right').\ + 
drop(columns=['global_id', 'original_ids'])\ + .rename(columns={'new_ids': 'global_id'}) + else: + renumber_map.implementation.ddf = renumbering_map.rename( + columns={'original_ids': '0', 'new_ids': 'global_id'}) + renumber_map.implementation.numbered = True + return renumbered_df, renumber_map, aggregate_segment_offsets + + else: + renumbering_map, segment_offsets, renumbered_df = \ + c_renumber.renumber(df, + num_edges, + 0, + Comms.get_default_handle(), + is_mnmg, + store_transposed) + if renumber_type == 'legacy': + renumber_map.implementation.df = indirection_map.\ + merge(renumbering_map, + right_on='original_ids', left_on='id').\ + drop(columns=['id', 'original_ids'])\ + .rename(columns={'new_ids': 'id'}, copy=False) + else: + renumber_map.implementation.df = renumbering_map.rename( + columns={'original_ids': '0', 'new_ids': 'id'}, copy=False) + + renumber_map.implementation.numbered = True + return renumbered_df, renumber_map, segment_offsets + + def renumber(df, src_col_names, dst_col_names, preserve_order=False, + store_transposed=False): + return NumberMap.renumber_and_segment( + df, src_col_names, dst_col_names, + preserve_order, store_transposed)[0:2] - def unrenumber(self, df, column_name, preserve_order=False): + def unrenumber(self, df, column_name, preserve_order=False, + get_column_names=False): """ Given a DataFrame containing internal vertex ids in the identified column, replace this with external vertex ids. If the renumbering @@ -837,30 +612,29 @@ def unrenumber(self, df, column_name, preserve_order=False): name for the external vertex identifiers. If the renumbering is from a multi-column input, the output columns will be labeled 0 through n-1 with a suffix of _column_name. - Note that this function does not guarantee order or partitioning in multi-GPU mode. - Parameters ---------- df: cudf.DataFrame or dask_cudf.DataFrame A DataFrame containing internal vertex identifiers that will be converted into external vertex identifiers. - column_name: string Name of the column containing the internal vertex id. - preserve_order: (optional) bool If True, preserve the order of the rows in the output DataFrame to match the input DataFrame - + get_column_names: (optional) bool + If True, the unrenumbered column names are returned. Returns --------- df : cudf.DataFrame or dask_cudf.DataFrame The original DataFrame columns exist unmodified. The external vertex identifiers are added to the DataFrame, the internal vertex identifier column is removed from the dataframe. - + column_names: string or list of strings + If get_column_names is True, the unrenumbered column names are + returned.
Examples -------- >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', @@ -880,11 +654,13 @@ def unrenumber(self, df, column_name, preserve_order=False): if len(self.implementation.col_names) == 1: # Output will be renamed to match input mapping = {"0": column_name} + col_names = column_name else: # Output will be renamed to ${i}_${column_name} mapping = {} for nm in self.implementation.col_names: mapping[nm] = nm + "_" + column_name + col_names = list(mapping.values()) if preserve_order: index_name = NumberMap.generate_unused_column_name(df) @@ -895,11 +671,18 @@ def unrenumber(self, df, column_name, preserve_order=False): if preserve_order: df = df.sort_values( index_name - ).drop(index_name).reset_index(drop=True) + ).drop(columns=index_name).reset_index(drop=True) if type(df) is dask_cudf.DataFrame: - return df.map_partitions( + df = df.map_partitions( lambda df: df.rename(columns=mapping, copy=False) ) else: - return df.rename(columns=mapping, copy=False) + df = df.rename(columns=mapping, copy=False) + if get_column_names: + return df, col_names + else: + return df + + def vertex_column_size(self): + return len(self.implementation.col_names) diff --git a/python/cugraph/structure/renumber_wrapper.pyx b/python/cugraph/structure/renumber_wrapper.pyx new file mode 100644 index 00000000000..4433bbb09cb --- /dev/null +++ b/python/cugraph/structure/renumber_wrapper.pyx @@ -0,0 +1,566 @@ +# +# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from cugraph.structure.utils_wrapper import * +import cudf +from cugraph.structure.graph_utilities cimport * +import cugraph.structure.graph_primtypes_wrapper as graph_primtypes_wrapper +from libc.stdint cimport uintptr_t +from cython.operator cimport dereference as deref +import numpy as np + +from libcpp.memory cimport make_unique +from libcpp.utility cimport move +from rmm._lib.device_buffer cimport device_buffer, DeviceBuffer + +cdef renumber_helper(shuffled_vertices_t* ptr_maj_min_w, vertex_t, weights): + # extract shuffled result: + # + cdef pair[unique_ptr[device_buffer], size_t] pair_s_major = deref(ptr_maj_min_w).get_major_wrap() + cdef pair[unique_ptr[device_buffer], size_t] pair_s_minor = deref(ptr_maj_min_w).get_minor_wrap() + cdef pair[unique_ptr[device_buffer], size_t] pair_s_weights = deref(ptr_maj_min_w).get_weights_wrap() + + shuffled_major_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_s_major.first)) + shuffled_major_buffer = Buffer(shuffled_major_buffer) + + shuffled_major_series = cudf.Series(data=shuffled_major_buffer, dtype=vertex_t) + + shuffled_minor_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_s_minor.first)) + shuffled_minor_buffer = Buffer(shuffled_minor_buffer) + + shuffled_minor_series = cudf.Series(data=shuffled_minor_buffer, dtype=vertex_t) + + shuffled_df = cudf.DataFrame() + shuffled_df['major_vertices']=shuffled_major_series + shuffled_df['minor_vertices']=shuffled_minor_series + + if weights is not None: + weight_t = weights.dtype + shuffled_weights_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_s_weights.first)) + shuffled_weights_buffer = Buffer(shuffled_weights_buffer) + + shuffled_weights_series = cudf.Series(data=shuffled_weights_buffer, dtype=weight_t) + + shuffled_df['value']= shuffled_weights_series + + return shuffled_df + + +def renumber(input_df, # maybe use cpdef ? + num_global_edges, + rank, + handle, + is_multi_gpu, + transposed): + """ + Call MNMG renumber + """ + cdef size_t handle_size_t = handle.getHandle() + # TODO: get handle_t out of handle... + handle_ptr = handle_size_t + + if not transposed: + major_vertices = input_df['src'] + minor_vertices = input_df['dst'] + else: + major_vertices = input_df['dst'] + minor_vertices = input_df['src'] + + cdef uintptr_t c_edge_weights = NULL # set below... 
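+    # The dtype checks that follow pick which C++ template instantiation to
+    # call: vertex_t is int32 or int64; edge_t matches vertex_t but is
+    # promoted to int64 when num_global_edges exceeds 2**31 - 1; weight_t is
+    # float32 or float64. In the multi-GPU case each branch shuffles edges
+    # to their owning partition (call_shuffle) before renumbering
+    # (call_renumber).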
+ + vertex_t = major_vertices.dtype + if num_global_edges > (2**31 - 1): + edge_t = np.dtype("int64") + else: + edge_t = vertex_t + if "value" in input_df.columns: + weights = input_df['value'] + weight_t = weights.dtype + c_edge_weights = weights.__cuda_array_interface__['data'][0] + else: + weights = None + weight_t = np.dtype("float32") + + if (vertex_t != np.dtype("int32") and vertex_t != np.dtype("int64")): + raise Exception("Incorrect vertex_t type.") + if (edge_t != np.dtype("int32") and edge_t != np.dtype("int64")): + raise Exception("Incorrect edge_t type.") + if (weight_t != np.dtype("float32") and weight_t != np.dtype("float64")): + raise Exception("Incorrect weight_t type.") + if (vertex_t != np.dtype("int32") and edge_t != np.dtype("int64")): + raise Exception("Incompatible vertex_t and edge_t types.") + + # FIXME: needs to be edge_t type not int + cdef int num_local_edges = len(major_vertices) + + cdef uintptr_t c_major_vertices = major_vertices.__cuda_array_interface__['data'][0] + cdef uintptr_t c_minor_vertices = minor_vertices.__cuda_array_interface__['data'][0] + + cdef uintptr_t shuffled_major = NULL + cdef uintptr_t shuffled_minor = NULL + + # FIXME: Fix fails when do_check = True + cdef bool do_check = False # ? for now... + cdef bool mg_flag = is_multi_gpu # run Single-GPU or MNMG + + cdef pair[unique_ptr[device_buffer], size_t] pair_original + + # tparams: vertex_t, edge_t, weight_t: + # + cdef unique_ptr[major_minor_weights_t[int, int, float]] ptr_shuffled_32_32_32 + cdef unique_ptr[major_minor_weights_t[int, int, double]] ptr_shuffled_32_32_64 + cdef unique_ptr[major_minor_weights_t[int, long, float]] ptr_shuffled_32_64_32 + cdef unique_ptr[major_minor_weights_t[int, long, double]] ptr_shuffled_32_64_64 + cdef unique_ptr[major_minor_weights_t[long, long, float]] ptr_shuffled_64_64_32 + cdef unique_ptr[major_minor_weights_t[long, long, double]] ptr_shuffled_64_64_64 + + # tparams: vertex_t, edge_t: + # + cdef unique_ptr[renum_tuple_t[int, int]] ptr_renum_tuple_32_32 + cdef unique_ptr[renum_tuple_t[int, long]] ptr_renum_tuple_32_64 + cdef unique_ptr[renum_tuple_t[long, long]] ptr_renum_tuple_64_64 + + # tparam: vertex_t: + # + cdef unique_ptr[vector[int]] edge_counts_32 + cdef unique_ptr[vector[long]] edge_counts_64 + + # tparam: vertex_t: + # + cdef unique_ptr[vector[int]] uniq_partition_vector_32 + cdef unique_ptr[vector[long]] uniq_partition_vector_64 + + # tparam: vertex_t: + # + cdef unique_ptr[vector[int]] uniq_segment_vector_32 + cdef unique_ptr[vector[long]] uniq_segment_vector_64 + + cdef size_t rank_indx = rank + + if (vertex_t == np.dtype("int32")): + if ( edge_t == np.dtype("int32")): + if( weight_t == np.dtype("float32")): + if(is_multi_gpu): + ptr_shuffled_32_32_32.reset(call_shuffle[int, int, float](deref(handle_ptr), + c_major_vertices, + c_minor_vertices, + c_edge_weights, + num_local_edges).release()) + shuffled_df = renumber_helper(ptr_shuffled_32_32_32.get(), vertex_t, weights) + major_vertices = shuffled_df['major_vertices'] + minor_vertices = shuffled_df['minor_vertices'] + num_local_edges = len(shuffled_df) + if not transposed: + major = 'src'; minor = 'dst' + else: + major = 'dst'; minor = 'src' + shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_32 = move(ptr_shuffled_32_32_32.get().get_edge_counts_wrap()) + else: + shuffled_df = input_df + edge_counts_32 = make_unique[vector[int]](1, num_local_edges) + + shuffled_major = major_vertices.__cuda_array_interface__['data'][0] + 
shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] + + ptr_renum_tuple_32_32.reset(call_renumber[int, int](deref(handle_ptr), + shuffled_major, + shuffled_minor, + deref(edge_counts_32.get()), + do_check, + mg_flag).release()) + + pair_original = ptr_renum_tuple_32_32.get().get_dv_wrap() # original vertices: see helper + + + original_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_original.first)) + original_buffer = Buffer(original_buffer) + + original_series = cudf.Series(data=original_buffer, dtype=vertex_t) + + # extract unique_ptr[partition_offsets]: + # + uniq_partition_vector_32 = move(ptr_renum_tuple_32_32.get().get_partition_offsets_wrap()) + + # create series out of a partition range from rank to rank+1: + # + if is_multi_gpu: + new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(rank_indx), + uniq_partition_vector_32.get()[0].at(rank_indx+1)), + dtype=vertex_t) + else: + new_series = cudf.Series(np.arange(0, ptr_renum_tuple_32_32.get().get_num_vertices()), + dtype=vertex_t) + # create new cudf df + # + # and add the previous series to it: + # + renumbered_map = cudf.DataFrame() + renumbered_map['original_ids'] = original_series + renumbered_map['new_ids'] = new_series + + uniq_segment_vector_32 = move(ptr_renum_tuple_32_32.get().get_segment_offsets_wrap()) + segment_offsets = [None] * (deref(uniq_segment_vector_32).size()) + for i in range(len(segment_offsets)): + segment_offsets[i] = deref(uniq_segment_vector_32)[i] + + return renumbered_map, segment_offsets, shuffled_df + + elif( weight_t == np.dtype("float64")): + if(is_multi_gpu): + ptr_shuffled_32_32_64.reset(call_shuffle[int, int, double](deref(handle_ptr), + c_major_vertices, + c_minor_vertices, + c_edge_weights, + num_local_edges).release()) + + shuffled_df = renumber_helper(ptr_shuffled_32_32_64.get(), vertex_t, weights) + major_vertices = shuffled_df['major_vertices'] + minor_vertices = shuffled_df['minor_vertices'] + num_local_edges = len(shuffled_df) + if not transposed: + major = 'src'; minor = 'dst' + else: + major = 'dst'; minor = 'src' + shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_32 = move(ptr_shuffled_32_32_64.get().get_edge_counts_wrap()) + else: + shuffled_df = input_df + edge_counts_32 = make_unique[vector[int]](1, num_local_edges) + + shuffled_major = major_vertices.__cuda_array_interface__['data'][0] + shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] + + ptr_renum_tuple_32_32.reset(call_renumber[int, int](deref(handle_ptr), + shuffled_major, + shuffled_minor, + deref(edge_counts_32.get()), + do_check, + mg_flag).release()) + + pair_original = ptr_renum_tuple_32_32.get().get_dv_wrap() # original vertices: see helper + + + original_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_original.first)) + original_buffer = Buffer(original_buffer) + + original_series = cudf.Series(data=original_buffer, dtype=vertex_t) + + # extract unique_ptr[partition_offsets]: + # + uniq_partition_vector_32 = move(ptr_renum_tuple_32_32.get().get_partition_offsets_wrap()) + + # create series out of a partition range from rank to rank+1: + # + if is_multi_gpu: + new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(rank_indx), + uniq_partition_vector_32.get()[0].at(rank_indx+1)), + dtype=vertex_t) + else: + new_series = cudf.Series(np.arange(0, ptr_renum_tuple_32_32.get().get_num_vertices()), + dtype=vertex_t) + + # create new cudf df + # + # and add the previous series to it: + # + 
renumbered_map = cudf.DataFrame() + renumbered_map['original_ids'] = original_series + renumbered_map['new_ids'] = new_series + + uniq_segment_vector_32 = move(ptr_renum_tuple_32_32.get().get_segment_offsets_wrap()) + segment_offsets = [None] * (deref(uniq_segment_vector_32).size()) + for i in range(len(segment_offsets)): + segment_offsets[i] = deref(uniq_segment_vector_32)[i] + + return renumbered_map, segment_offsets, shuffled_df + + elif ( edge_t == np.dtype("int64")): + if( weight_t == np.dtype("float32")): + if(is_multi_gpu): + ptr_shuffled_32_64_32.reset(call_shuffle[int, long, float](deref(handle_ptr), + c_major_vertices, + c_minor_vertices, + c_edge_weights, + num_local_edges).release()) + + shuffled_df = renumber_helper(ptr_shuffled_32_64_32.get(), vertex_t, weights) + major_vertices = shuffled_df['major_vertices'] + minor_vertices = shuffled_df['minor_vertices'] + num_local_edges = len(shuffled_df) + if not transposed: + major = 'src'; minor = 'dst' + else: + major = 'dst'; minor = 'src' + shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_32_64_32.get().get_edge_counts_wrap()) + else: + shuffled_df = input_df + edge_counts_64 = make_unique[vector[long]](1, num_local_edges) + + shuffled_major = major_vertices.__cuda_array_interface__['data'][0] + shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] + + ptr_renum_tuple_32_64.reset(call_renumber[int, long](deref(handle_ptr), + shuffled_major, + shuffled_minor, + deref(edge_counts_64.get()), + do_check, + mg_flag).release()) + + pair_original = ptr_renum_tuple_32_64.get().get_dv_wrap() # original vertices: see helper + + + original_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_original.first)) + original_buffer = Buffer(original_buffer) + + original_series = cudf.Series(data=original_buffer, dtype=vertex_t) + + # extract unique_ptr[partition_offsets]: + # + uniq_partition_vector_32 = move(ptr_renum_tuple_32_64.get().get_partition_offsets_wrap()) + + # create series out of a partition range from rank to rank+1: + # + if is_multi_gpu: + new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(rank_indx), + uniq_partition_vector_32.get()[0].at(rank_indx+1)), + dtype=vertex_t) + else: + new_series = cudf.Series(np.arange(0, ptr_renum_tuple_32_64.get().get_num_vertices()), + dtype=vertex_t) + + # create new cudf df + # + # and add the previous series to it: + # + renumbered_map = cudf.DataFrame() + renumbered_map['original_ids'] = original_series + renumbered_map['new_ids'] = new_series + + uniq_segment_vector_32 = move(ptr_renum_tuple_32_64.get().get_segment_offsets_wrap()) + segment_offsets = [None] * (deref(uniq_segment_vector_32).size()) + for i in range(len(segment_offsets)): + segment_offsets[i] = deref(uniq_segment_vector_32)[i] + + return renumbered_map, segment_offsets, shuffled_df + elif( weight_t == np.dtype("float64")): + if(is_multi_gpu): + ptr_shuffled_32_64_64.reset(call_shuffle[int, long, double](deref(handle_ptr), + c_major_vertices, + c_minor_vertices, + c_edge_weights, + num_local_edges).release()) + + shuffled_df = renumber_helper(ptr_shuffled_32_64_64.get(), vertex_t, weights) + major_vertices = shuffled_df['major_vertices'] + minor_vertices = shuffled_df['minor_vertices'] + num_local_edges = len(shuffled_df) + if not transposed: + major = 'src'; minor = 'dst' + else: + major = 'dst'; minor = 'src' + shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, 
copy=False) + edge_counts_64 = move(ptr_shuffled_32_64_64.get().get_edge_counts_wrap()) + else: + shuffled_df = input_df + edge_counts_64 = make_unique[vector[long]](1, num_local_edges) + + shuffled_major = major_vertices.__cuda_array_interface__['data'][0] + shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] + + ptr_renum_tuple_32_64.reset(call_renumber[int, long](deref(handle_ptr), + shuffled_major, + shuffled_minor, + deref(edge_counts_64.get()), + do_check, + mg_flag).release()) + + pair_original = ptr_renum_tuple_32_64.get().get_dv_wrap() # original vertices: see helper + + + original_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_original.first)) + original_buffer = Buffer(original_buffer) + + original_series = cudf.Series(data=original_buffer, dtype=vertex_t) + + # extract unique_ptr[partition_offsets]: + # + uniq_partition_vector_32 = move(ptr_renum_tuple_32_64.get().get_partition_offsets_wrap()) + + # create series out of a partition range from rank to rank+1: + # + if is_multi_gpu: + new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(rank_indx), + uniq_partition_vector_32.get()[0].at(rank_indx+1)), + dtype=vertex_t) + else: + new_series = cudf.Series(np.arange(0, ptr_renum_tuple_32_64.get().get_num_vertices()), + dtype=vertex_t) + # create new cudf df + # + # and add the previous series to it: + # + renumbered_map = cudf.DataFrame() + renumbered_map['original_ids'] = original_series + renumbered_map['new_ids'] = new_series + + uniq_segment_vector_32 = move(ptr_renum_tuple_32_64.get().get_segment_offsets_wrap()) + segment_offsets = [None] * (deref(uniq_segment_vector_32).size()) + for i in range(len(segment_offsets)): + segment_offsets[i] = deref(uniq_segment_vector_32)[i] + + return renumbered_map, segment_offsets, shuffled_df + + elif (vertex_t == np.dtype("int64")): + if ( edge_t == np.dtype("int64")): + if( weight_t == np.dtype("float32")): + if(is_multi_gpu): + ptr_shuffled_64_64_32.reset(call_shuffle[long, long, float](deref(handle_ptr), + c_major_vertices, + c_minor_vertices, + c_edge_weights, + num_local_edges).release()) + + shuffled_df = renumber_helper(ptr_shuffled_64_64_32.get(), vertex_t, weights) + major_vertices = shuffled_df['major_vertices'] + minor_vertices = shuffled_df['minor_vertices'] + num_local_edges = len(shuffled_df) + if not transposed: + major = 'src'; minor = 'dst' + else: + major = 'dst'; minor = 'src' + shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_64_64_32.get().get_edge_counts_wrap()) + else: + shuffled_df = input_df + edge_counts_64 = make_unique[vector[long]](1, num_local_edges) + + shuffled_major = major_vertices.__cuda_array_interface__['data'][0] + shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] + + ptr_renum_tuple_64_64.reset(call_renumber[long, long](deref(handle_ptr), + shuffled_major, + shuffled_minor, + deref(edge_counts_64.get()), + do_check, + mg_flag).release()) + + pair_original = ptr_renum_tuple_64_64.get().get_dv_wrap() # original vertices: see helper + + + original_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_original.first)) + original_buffer = Buffer(original_buffer) + + original_series = cudf.Series(data=original_buffer, dtype=vertex_t) + + # extract unique_ptr[partition_offsets]: + # + uniq_partition_vector_64 = move(ptr_renum_tuple_64_64.get().get_partition_offsets_wrap()) + + # create series out of a partition range from rank to rank+1: + # + if is_multi_gpu: + 
new_series = cudf.Series(np.arange(uniq_partition_vector_64.get()[0].at(rank_indx), + uniq_partition_vector_64.get()[0].at(rank_indx+1)), + dtype=vertex_t) + else: + new_series = cudf.Series(np.arange(0, ptr_renum_tuple_64_64.get().get_num_vertices()), + dtype=vertex_t) + + # create new cudf df + # + # and add the previous series to it: + # + renumbered_map = cudf.DataFrame() + renumbered_map['original_ids'] = original_series + renumbered_map['new_ids'] = new_series + + uniq_segment_vector_64 = move(ptr_renum_tuple_64_64.get().get_segment_offsets_wrap()) + segment_offsets = [None] * (deref(uniq_segment_vector_64).size()) + for i in range(len(segment_offsets)): + segment_offsets[i] = deref(uniq_segment_vector_64)[i] + + return renumbered_map, segment_offsets, shuffled_df + + elif( weight_t == np.dtype("float64")): + if(is_multi_gpu): + ptr_shuffled_64_64_64.reset(call_shuffle[long, long, double](deref(handle_ptr), + c_major_vertices, + c_minor_vertices, + c_edge_weights, + num_local_edges).release()) + + shuffled_df = renumber_helper(ptr_shuffled_64_64_64.get(), vertex_t, weights) + major_vertices = shuffled_df['major_vertices'] + minor_vertices = shuffled_df['minor_vertices'] + num_local_edges = len(shuffled_df) + if not transposed: + major = 'src'; minor = 'dst' + else: + major = 'dst'; minor = 'src' + shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_64_64_64.get().get_edge_counts_wrap()) + else: + shuffled_df = input_df + edge_counts_64 = make_unique[vector[long]](1, num_local_edges) + + shuffled_major = major_vertices.__cuda_array_interface__['data'][0] + shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] + + ptr_renum_tuple_64_64.reset(call_renumber[long, long](deref(handle_ptr), + shuffled_major, + shuffled_minor, + deref(edge_counts_64.get()), + do_check, + mg_flag).release()) + + pair_original = ptr_renum_tuple_64_64.get().get_dv_wrap() # original vertices: see helper + + + original_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_original.first)) + original_buffer = Buffer(original_buffer) + + original_series = cudf.Series(data=original_buffer, dtype=vertex_t) + + # extract unique_ptr[partition_offsets]: + # + uniq_partition_vector_64 = move(ptr_renum_tuple_64_64.get().get_partition_offsets_wrap()) + + # create series out of a partition range from rank to rank+1: + # + if is_multi_gpu: + new_series = cudf.Series(np.arange(uniq_partition_vector_64.get()[0].at(rank_indx), + uniq_partition_vector_64.get()[0].at(rank_indx+1)), + dtype=vertex_t) + else: + new_series = cudf.Series(np.arange(0, ptr_renum_tuple_64_64.get().get_num_vertices()), + dtype=vertex_t) + + # create new cudf df + # + # and add the previous series to it: + # + renumbered_map = cudf.DataFrame() + renumbered_map['original_ids'] = original_series + renumbered_map['new_ids'] = new_series + + uniq_segment_vector_64 = move(ptr_renum_tuple_64_64.get().get_segment_offsets_wrap()) + segment_offsets = [None] * (deref(uniq_segment_vector_64).size()) + for i in range(len(segment_offsets)): + segment_offsets[i] = deref(uniq_segment_vector_64)[i] + + return renumbered_map, segment_offsets, shuffled_df diff --git a/python/cugraph/structure/symmetrize.py b/python/cugraph/structure/symmetrize.py index e7fd15144aa..13116eabb07 100644 --- a/python/cugraph/structure/symmetrize.py +++ b/python/cugraph/structure/symmetrize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,12 +11,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.structure import graph as csg +from cugraph.structure import graph_classes as csg import cudf import dask_cudf +from cugraph.comms import comms as Comms -def symmetrize_df(df, src_name, dst_name): +def symmetrize_df(df, src_name, dst_name, multi=False, symmetrize=True): """ Take a COO stored in a DataFrame, along with the column names of the source and destination columns and create a new data frame @@ -32,6 +33,7 @@ != data2 then this code will arbitrarily pick the smaller data element to keep, if this is not desired then the caller should correct the data prior to calling symmetrize. + Parameters ---------- df : cudf.DataFrame @@ -42,6 +44,13 @@ Name of the column in the data frame containing the source ids dst_name : string Name of the column in the data frame containing the destination ids + multi : bool + Set to True if the graph is a Multi(Di)Graph. This allows multiple + edges instead of dropping them. + symmetrize : bool + Default is True to perform symmetrization. If False only duplicate + edges are dropped. + Examples -------- >>> import cugraph.dask as dcg @@ -54,26 +63,30 @@ >>> sym_ddf = cugraph.symmetrize_ddf(ddf, "src", "dst", "weight") >>> Comms.destroy() """ - gdf = cudf.DataFrame() - # # Now append the columns. We add sources to the end of destinations, # and destinations to the end of sources. Otherwise we append a # column onto itself. # - for idx, name in enumerate(df.columns): - if name == src_name: - gdf[src_name] = df[src_name].append( - df[dst_name], ignore_index=True - ) - elif name == dst_name: - gdf[dst_name] = df[dst_name].append( - df[src_name], ignore_index=True - ) - else: - gdf[name] = df[name].append(df[name], ignore_index=True) - - return gdf.groupby(by=[src_name, dst_name], as_index=False).min() + if symmetrize: + gdf = cudf.DataFrame() + for idx, name in enumerate(df.columns): + if name == src_name: + gdf[src_name] = df[src_name].append( + df[dst_name], ignore_index=True + ) + elif name == dst_name: + gdf[dst_name] = df[dst_name].append( + df[src_name], ignore_index=True + ) + else: + gdf[name] = df[name].append(df[name], ignore_index=True) + else: + gdf = df + if multi: + return gdf + else: + return gdf.groupby(by=[src_name, dst_name], as_index=False).min() def symmetrize_ddf(df, src_name, dst_name, weight_name=None): @@ -105,6 +118,12 @@ Name of the column in the data frame containing the source ids dst_name : string Name of the column in the data frame containing the destination ids + multi : bool + Set to True if the graph is a Multi(Di)Graph. This allows multiple + edges instead of dropping them. + symmetrize : bool + Default is True to perform symmetrization. If False only duplicate + edges are dropped.
Examples -------- @@ -118,18 +137,19 @@ def symmetrize_ddf(df, src_name, dst_name, weight_name=None): else: ddf2 = df[[dst_name, src_name]] ddf2.columns = [src_name, dst_name] - + worker_list = Comms.get_workers() + num_workers = len(worker_list) ddf = df.append(ddf2).reset_index(drop=True) - result = ( - ddf.groupby(by=[src_name, dst_name], as_index=False) - .min() - .reset_index() - ) + result = ddf.shuffle(on=[ + src_name, dst_name], ignore_index=True, npartitions=num_workers) + result = result.map_partitions(lambda x: x.groupby( + by=[src_name, dst_name], as_index=False).min().reset_index(drop=True)) return result -def symmetrize(source_col, dest_col, value_col=None): +def symmetrize(source_col, dest_col, value_col=None, multi=False, + symmetrize=True): """ Take a COO set of source destination pairs along with associated values stored in a single GPU or distributed @@ -182,20 +202,31 @@ def symmetrize(source_col, dest_col, value_col=None): csg.null_check(source_col) csg.null_check(dest_col) if value_col is not None: - weight_name = "value" - input_df.insert(len(input_df.columns), "value", value_col) + if isinstance(value_col, cudf.Series): + weight_name = "value" + input_df.insert(len(input_df.columns), "value", value_col) + elif isinstance(value_col, cudf.DataFrame): + input_df = cudf.concat([input_df, value_col], axis=1) + output_df = None if type(source_col) is dask_cudf.Series: output_df = symmetrize_ddf( input_df, "source", "destination", weight_name ).persist() else: - output_df = symmetrize_df(input_df, "source", "destination") - + output_df = symmetrize_df(input_df, "source", "destination", multi, + symmetrize) if value_col is not None: - return ( - output_df["source"], - output_df["destination"], - output_df["value"], - ) + if isinstance(value_col, cudf.Series): + return ( + output_df["source"], + output_df["destination"], + output_df["value"], + ) + elif isinstance(value_col, cudf.DataFrame): + return ( + output_df["source"], + output_df["destination"], + output_df[value_col.columns], + ) return output_df["source"], output_df["destination"] diff --git a/python/cugraph/structure/utils.pxd b/python/cugraph/structure/utils.pxd index 0ec9c914347..350b5890149 100644 --- a/python/cugraph/structure/utils.pxd +++ b/python/cugraph/structure/utils.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -19,11 +19,8 @@ from cugraph.structure.graph_primtypes cimport * from libcpp.memory cimport unique_ptr -cdef extern from "raft/handle.hpp" namespace "raft": - cdef cppclass handle_t: - handle_t() except + -cdef extern from "functions.hpp" namespace "cugraph": +cdef extern from "cugraph/functions.hpp" namespace "cugraph": cdef unique_ptr[GraphCSR[VT,ET,WT]] coo_to_csr[VT,ET,WT]( const GraphCOOView[VT,ET,WT] &graph) except + diff --git a/python/cugraph/structure/utils_wrapper.pyx b/python/cugraph/structure/utils_wrapper.pyx index 00af5813056..65c1ca09750 100644 --- a/python/cugraph/structure/utils_wrapper.pyx +++ b/python/cugraph/structure/utils_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -22,7 +22,6 @@ from cugraph.structure.graph_primtypes cimport * from libc.stdint cimport uintptr_t import cudf -import rmm import numpy as np from rmm._lib.device_buffer cimport DeviceBuffer from cudf.core.buffer import Buffer diff --git a/python/cugraph/tests/dask/mg_context.py b/python/cugraph/tests/dask/mg_context.py index a72cf1c4b04..45dc75767fa 100644 --- a/python/cugraph/tests/dask/mg_context.py +++ b/python/cugraph/tests/dask/mg_context.py @@ -12,12 +12,15 @@ # limitations under the License. import time -import os + +import pytest from dask.distributed import Client + +from cugraph.dask.common.mg_utils import get_visible_devices from dask_cuda import LocalCUDACluster as CUDACluster import cugraph.comms as Comms -import pytest + # Maximal number of verifications of the number of workers DEFAULT_MAX_ATTEMPT = 100 @@ -26,22 +29,13 @@ DEFAULT_WAIT_TIME = 0.5 -def get_visible_devices(): - _visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES") - if _visible_devices is None: - # FIXME: We assume that if the variable is unset there is only one GPU - visible_devices = ["0"] - else: - visible_devices = _visible_devices.strip().split(",") - return visible_devices - - def skip_if_not_enough_devices(required_devices): - visible_devices = get_visible_devices() - number_of_visible_devices = len(visible_devices) - if required_devices > number_of_visible_devices: - pytest.skip("Not enough devices available to " - "test MG({})".format(required_devices)) + if required_devices is not None: + visible_devices = get_visible_devices() + number_of_visible_devices = len(visible_devices) + if required_devices > number_of_visible_devices: + pytest.skip("Not enough devices available to " + "test MG({})".format(required_devices)) class MGContext: diff --git a/python/cugraph/tests/dask/test_mg_batch_betweenness_centrality.py b/python/cugraph/tests/dask/test_mg_batch_betweenness_centrality.py index 4d04bf6df85..02696f589e3 100644 --- a/python/cugraph/tests/dask/test_mg_batch_betweenness_centrality.py +++ b/python/cugraph/tests/dask/test_mg_batch_betweenness_centrality.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -16,6 +16,7 @@ from cugraph.tests.dask.mg_context import MGContext, skip_if_not_enough_devices from cugraph.dask.common.mg_utils import is_single_gpu +from cugraph.tests import utils # Get parameters from standard betweenness_centrality_test from cugraph.tests.test_betweenness_centrality import ( @@ -36,8 +37,12 @@ # ============================================================================= # Parameters # ============================================================================= -DATASETS = ["../datasets/karate.csv"] -MG_DEVICE_COUNT_OPTIONS = [1, 2, 3, 4] +DATASETS = [utils.DATASETS_UNDIRECTED[0]] +MG_DEVICE_COUNT_OPTIONS = [pytest.param(1, marks=pytest.mark.preset_gpu_count), + pytest.param(2, marks=pytest.mark.preset_gpu_count), + pytest.param(3, marks=pytest.mark.preset_gpu_count), + pytest.param(4, marks=pytest.mark.preset_gpu_count), + None] RESULT_DTYPE_OPTIONS = [np.float64] @@ -46,7 +51,8 @@ @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS, + ids=[f"dataset={d.as_posix()}" for d in DATASETS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) diff --git a/python/cugraph/tests/dask/test_mg_batch_edge_betweenness_centrality.py b/python/cugraph/tests/dask/test_mg_batch_edge_betweenness_centrality.py index 1e4a1950c53..89844797807 100644 --- a/python/cugraph/tests/dask/test_mg_batch_edge_betweenness_centrality.py +++ b/python/cugraph/tests/dask/test_mg_batch_edge_betweenness_centrality.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
import cugraph.dask as dcg -import cugraph.comms as Comms -from dask.distributed import Client import gc import pytest import cugraph import dask_cudf import cudf -from dask_cuda import LocalCUDACluster -from cugraph.dask.common.mg_utils import is_single_gpu +from cugraph.dask.common.mg_utils import (is_single_gpu, + setup_local_dask_cluster, + teardown_local_dask_cluster) -@pytest.fixture +@pytest.fixture(scope="module") def client_connection(): - cluster = LocalCUDACluster() - client = Client(cluster) - Comms.initialize(p2p=True) - + (cluster, client) = setup_local_dask_cluster(p2p=True) yield client - - Comms.destroy() - client.close() - cluster.close() + teardown_local_dask_cluster(cluster, client) @pytest.mark.skipif( @@ -42,7 +35,10 @@ def client_connection(): def test_dask_bfs(client_connection): gc.collect() + # FIXME: update this to allow dataset to be parameterized and have dataset + # part of test param id (see other tests) input_data_path = r"../datasets/netscience.csv" + print(f"dataset={input_data_path}") chunksize = dcg.get_chunksize(input_data_path) ddf = dask_cudf.read_csv( @@ -67,9 +63,8 @@ def test_dask_bfs(client_connection): dg.from_dask_cudf_edgelist(ddf, "src", "dst") expected_dist = cugraph.bfs(g, 0) - result_dist = dcg.bfs(dg, 0, True) + result_dist = dcg.bfs(dg, 0, depth_limit=2) result_dist = result_dist.compute() - compare_dist = expected_dist.merge( result_dist, on="vertex", suffixes=["_local", "_dask"] ) @@ -83,3 +78,65 @@ def test_dask_bfs(client_connection): ): err = err + 1 assert err == 0 + + +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) +def test_dask_bfs_multi_column_depthlimit(client_connection): + gc.collect() + + # FIXME: update this to allow dataset to be parameterized and have dataset + # part of test param id (see other tests) + input_data_path = r"../datasets/netscience.csv" + print(f"dataset={input_data_path}") + chunksize = dcg.get_chunksize(input_data_path) + + ddf = dask_cudf.read_csv( + input_data_path, + chunksize=chunksize, + delimiter=" ", + names=["src_a", "dst_a", "value"], + dtype=["int32", "int32", "float32"], + ) + ddf['src_b'] = ddf['src_a'] + 1000 + ddf['dst_b'] = ddf['dst_a'] + 1000 + + df = cudf.read_csv( + input_data_path, + delimiter=" ", + names=["src_a", "dst_a", "value"], + dtype=["int32", "int32", "float32"], + ) + df['src_b'] = df['src_a'] + 1000 + df['dst_b'] = df['dst_a'] + 1000 + + g = cugraph.DiGraph() + g.from_cudf_edgelist(df, ["src_a", "src_b"], ["dst_a", "dst_b"]) + + dg = cugraph.DiGraph() + dg.from_dask_cudf_edgelist(ddf, ["src_a", "src_b"], ["dst_a", "dst_b"]) + + start = cudf.DataFrame() + start['a'] = [0] + start['b'] = [1000] + + depth_limit = 18 + expected_dist = cugraph.bfs(g, start, depth_limit=depth_limit) + result_dist = dcg.bfs(dg, start, depth_limit=depth_limit) + result_dist = result_dist.compute() + + compare_dist = expected_dist.merge( + result_dist, on=["0_vertex", "1_vertex"], suffixes=["_local", "_dask"] + ) + + err = 0 + for i in range(len(compare_dist)): + if ( + compare_dist["distance_local"].iloc[i] <= depth_limit and + compare_dist["distance_dask"].iloc[i] <= depth_limit and + compare_dist["distance_local"].iloc[i] + != compare_dist["distance_dask"].iloc[i] + ): + err = err + 1 + assert err == 0 diff --git a/python/cugraph/tests/dask/test_mg_comms.py b/python/cugraph/tests/dask/test_mg_comms.py index 29789461018..03a0a5d73d2 100644 --- a/python/cugraph/tests/dask/test_mg_comms.py +++ b/python/cugraph/tests/dask/test_mg_comms.py @@ -1,4 +1,4 @@ 
-# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,28 +12,21 @@ # limitations under the License. import cugraph.dask as dcg -import cugraph.comms as Comms -from dask.distributed import Client import gc import pytest import cugraph import dask_cudf import cudf -from dask_cuda import LocalCUDACluster -from cugraph.dask.common.mg_utils import is_single_gpu +from cugraph.dask.common.mg_utils import (is_single_gpu, + setup_local_dask_cluster, + teardown_local_dask_cluster) -@pytest.fixture +@pytest.fixture(scope="module") def client_connection(): - cluster = LocalCUDACluster() - client = Client(cluster) - Comms.initialize(p2p=True) - + (cluster, client) = setup_local_dask_cluster(p2p=True) yield client - - Comms.destroy() - client.close() - cluster.close() + teardown_local_dask_cluster(cluster, client) @pytest.mark.skipif( @@ -45,10 +38,14 @@ def test_dask_pagerank(client_connection): # Initialize and run pagerank on two distributed graphs # with same communicator + # FIXME: update this to allow dataset to be parameterized and have dataset + # part of test param id (see other tests) input_data_path1 = r"../datasets/karate.csv" + print(f"dataset1={input_data_path1}") chunksize1 = dcg.get_chunksize(input_data_path1) input_data_path2 = r"../datasets/dolphins.csv" + print(f"dataset2={input_data_path2}") chunksize2 = dcg.get_chunksize(input_data_path2) ddf1 = dask_cudf.read_csv( diff --git a/python/cugraph/tests/dask/test_mg_connectivity.py b/python/cugraph/tests/dask/test_mg_connectivity.py new file mode 100644 index 00000000000..2f946789471 --- /dev/null +++ b/python/cugraph/tests/dask/test_mg_connectivity.py @@ -0,0 +1,78 @@ +# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
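+# This test builds the same graph single-GPU (cugraph) and distributed
+# (dask_cudf), then checks that the multi-GPU weakly connected components
+# labeling is consistent with the single-GPU one: the labels themselves may
+# differ, so all vertices that share a label in the single-GPU result must
+# map to a single label in the distributed result.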
+ +import cugraph.dask as dcg +import gc +import pytest +import cugraph +import dask_cudf +import cudf +from cugraph.dask.common.mg_utils import (is_single_gpu, + setup_local_dask_cluster, + teardown_local_dask_cluster) + + +@pytest.fixture(scope="module") +def client_connection(): + (cluster, client) = setup_local_dask_cluster(p2p=True) + yield client + teardown_local_dask_cluster(cluster, client) + + +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) +def test_dask_wcc(client_connection): + gc.collect() + + # FIXME: update this to allow dataset to be parameterized and have dataset + # part of test param id (see other tests) + input_data_path = r"../datasets/netscience.csv" + print(f"dataset={input_data_path}") + chunksize = dcg.get_chunksize(input_data_path) + + ddf = dask_cudf.read_csv( + input_data_path, + chunksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + df = cudf.read_csv( + input_data_path, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + g = cugraph.DiGraph() + g.from_cudf_edgelist(df, "src", "dst", renumber=True) + + dg = cugraph.DiGraph() + dg.from_dask_cudf_edgelist(ddf, "src", "dst") + + expected_dist = cugraph.weakly_connected_components(g) + result_dist = dcg.weakly_connected_components(dg) + + result_dist = result_dist.compute() + compare_dist = expected_dist.merge( + result_dist, on="vertex", suffixes=["_local", "_dask"] + ) + + unique_local_labels = compare_dist['labels_local'].unique() + + for label in unique_local_labels.values.tolist(): + dask_labels_df = compare_dist[compare_dist['labels_local'] == label] + dask_labels = dask_labels_df['labels_dask'] + assert (dask_labels.iloc[0] == dask_labels).all() diff --git a/python/cugraph/tests/dask/test_mg_degree.py b/python/cugraph/tests/dask/test_mg_degree.py index a6600104bc8..bad55df1ca9 100644 --- a/python/cugraph/tests/dask/test_mg_degree.py +++ b/python/cugraph/tests/dask/test_mg_degree.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,30 +11,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
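+# NOTE: a single module-scoped Dask cluster (see the client_connection
+# fixture below) is shared by the tests in this file instead of being
+# created and torn down for every test.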
-from dask.distributed import Client import gc import pytest import cudf -import cugraph.comms as Comms import cugraph import dask_cudf -from cugraph.dask.common.mg_utils import is_single_gpu +from cugraph.dask.common.mg_utils import (is_single_gpu, + setup_local_dask_cluster, + teardown_local_dask_cluster) -# Move to conftest -from dask_cuda import LocalCUDACluster - -@pytest.fixture +@pytest.fixture(scope="module") def client_connection(): - cluster = LocalCUDACluster() - client = Client(cluster) - Comms.initialize(p2p=True) - + (cluster, client) = setup_local_dask_cluster(p2p=True) yield client - - Comms.destroy() - client.close() - cluster.close() + teardown_local_dask_cluster(cluster, client) @pytest.mark.skipif( @@ -43,7 +34,10 @@ def client_connection(): def test_dask_mg_degree(client_connection): gc.collect() - input_data_path = r"../datasets/karate.csv" + # FIXME: update this to allow dataset to be parameterized and have dataset + # part of test param id (see other tests) + input_data_path = r"../datasets/karate-asymmetric.csv" + print(f"dataset={input_data_path}") chunksize = cugraph.dask.get_chunksize(input_data_path) @@ -68,10 +62,18 @@ def test_dask_mg_degree(client_connection): g = cugraph.DiGraph() g.from_cudf_edgelist(df, "src", "dst") - merge_df = ( + merge_df_in = ( dg.in_degree() .merge(g.in_degree(), on="vertex", suffixes=["_dg", "_g"]) .compute() ) - assert merge_df["degree_dg"].equals(merge_df["degree_g"]) + merge_df_out = ( + dg.out_degree() + .merge(g.out_degree(), on="vertex", suffixes=["_dg", "_g"]) + .compute() + ) + + assert merge_df_in["degree_dg"].equals(merge_df_in["degree_g"]) + assert merge_df_out["degree_dg"].equals( + merge_df_out["degree_g"]) diff --git a/python/cugraph/tests/dask/test_mg_katz_centrality.py b/python/cugraph/tests/dask/test_mg_katz_centrality.py index 43d63f2fd5d..8ed604954f4 100644 --- a/python/cugraph/tests/dask/test_mg_katz_centrality.py +++ b/python/cugraph/tests/dask/test_mg_katz_centrality.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -14,30 +14,20 @@ # import numpy as np import pytest import cugraph.dask as dcg -import cugraph.comms as Comms -from dask.distributed import Client import gc import cugraph import dask_cudf import cudf -from dask_cuda import LocalCUDACluster -from cugraph.dask.common.mg_utils import is_single_gpu +from cugraph.dask.common.mg_utils import (is_single_gpu, + setup_local_dask_cluster, + teardown_local_dask_cluster) -# The function selects personalization_perc% of accessible vertices in graph M -# and randomly assigns them personalization values - -@pytest.fixture +@pytest.fixture(scope="module") def client_connection(): - cluster = LocalCUDACluster() - client = Client(cluster) - Comms.initialize(p2p=True) - + (cluster, client) = setup_local_dask_cluster(p2p=True) yield client - - Comms.destroy() - client.close() - cluster.close() + teardown_local_dask_cluster(cluster, client) @pytest.mark.skipif( @@ -46,7 +36,10 @@ def client_connection(): def test_dask_katz_centrality(client_connection): gc.collect() + # FIXME: update this to allow dataset to be parameterized and have dataset + # part of test param id (see other tests) input_data_path = r"../datasets/karate.csv" + print(f"dataset={input_data_path}") chunksize = dcg.get_chunksize(input_data_path) ddf = dask_cudf.read_csv( @@ -57,21 +50,12 @@ def test_dask_katz_centrality(client_connection): dtype=["int32", "int32", "float32"], ) - df = cudf.read_csv( - input_data_path, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - g = cugraph.DiGraph() - g.from_cudf_edgelist(df, "src", "dst") - dg = cugraph.DiGraph() dg.from_dask_cudf_edgelist(ddf, "src", "dst") - largest_out_degree = g.degrees().nlargest(n=1, columns="out_degree") - largest_out_degree = largest_out_degree["out_degree"].iloc[0] + largest_out_degree = dg.out_degree().compute().\ + nlargest(n=1, columns="degree") + largest_out_degree = largest_out_degree["degree"].iloc[0] katz_alpha = 1 / (largest_out_degree + 1) mg_res = dcg.katz_centrality(dg, alpha=katz_alpha, tol=1e-6) diff --git a/python/cugraph/tests/dask/test_mg_louvain.py b/python/cugraph/tests/dask/test_mg_louvain.py index 56401e338a4..c67d8fcb1f9 100644 --- a/python/cugraph/tests/dask/test_mg_louvain.py +++ b/python/cugraph/tests/dask/test_mg_louvain.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
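In the Katz change above, the single-GPU graph that existed only to derive `katz_alpha` is gone; the largest out-degree now comes straight from the distributed graph. The `1 / (d_max + 1)` choice is the usual safe damping factor: Katz converges when alpha is below the reciprocal of the adjacency matrix's largest eigenvalue, and that eigenvalue is bounded above by the maximum degree. Condensed, assuming `dg` is the distributed `cugraph.DiGraph` built above:

```python
# Condensed from the updated test; dg is assumed to be a cugraph.DiGraph
# built with from_dask_cudf_edgelist, as in the hunk above.
largest = dg.out_degree().compute().nlargest(n=1, columns="degree")
d_max = largest["degree"].iloc[0]

# alpha < 1 / lambda_max(A) guarantees convergence; lambda_max <= d_max,
# so 1 / (d_max + 1) is always on the safe side.
katz_alpha = 1 / (d_max + 1)
```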
# You may obtain a copy of the License at @@ -14,13 +14,12 @@ import pytest import cugraph.dask as dcg -import cugraph.comms as Comms -from dask.distributed import Client import cugraph import dask_cudf -from dask_cuda import LocalCUDACluster from cugraph.tests import utils -from cugraph.dask.common.mg_utils import is_single_gpu +from cugraph.dask.common.mg_utils import (is_single_gpu, + setup_local_dask_cluster, + teardown_local_dask_cluster) try: from rapids_pytest_benchmark import setFixtureParamNames @@ -44,23 +43,18 @@ def setFixtureParamNames(*args, **kwargs): # Fixtures @pytest.fixture(scope="module") def client_connection(): - # setup - cluster = LocalCUDACluster() - client = Client(cluster) - Comms.initialize(p2p=True) - + (cluster, client) = setup_local_dask_cluster(p2p=True) yield client - - # teardown - Comms.destroy() - client.close() - cluster.close() + teardown_local_dask_cluster(cluster, client) @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.fixture(scope="module", params=utils.DATASETS_UNDIRECTED) +@pytest.fixture(scope="module", + params=utils.DATASETS_UNDIRECTED, + ids=[f"dataset={d.as_posix()}" + for d in utils.DATASETS_UNDIRECTED]) def daskGraphFromDataset(request, client_connection): """ Returns a new dask dataframe created from the dataset file param. @@ -96,7 +90,7 @@ def test_mg_louvain_with_edgevals(daskGraphFromDataset): parts, mod = dcg.louvain(daskGraphFromDataset) # FIXME: either call Nx with the same dataset and compare results, or - # hadcode golden results to compare to. + # hardcode golden results to compare to. print() print(parts.compute()) print(mod) diff --git a/python/cugraph/tests/dask/test_mg_pagerank.py b/python/cugraph/tests/dask/test_mg_pagerank.py index f6416903b89..9cb00010311 100644 --- a/python/cugraph/tests/dask/test_mg_pagerank.py +++ b/python/cugraph/tests/dask/test_mg_pagerank.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
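The `ids=` additions here and in the other parametrizations throughout this diff embed the dataset path in each generated test ID, so CI logs identify which dataset a failure came from. A small illustration with a hypothetical dataset list:

```python
import pytest
from pathlib import PurePath

# Hypothetical dataset list; the ids= pattern matches the diff above.
DATASETS = [PurePath("../datasets") / f for f in ["karate.csv", "dolphins.csv"]]


@pytest.mark.parametrize("graph_file", DATASETS,
                         ids=[f"dataset={d.as_posix()}" for d in DATASETS])
def test_example(graph_file):
    # Generated test IDs look like:
    #   test_example[dataset=../datasets/karate.csv]
    #   test_example[dataset=../datasets/dolphins.csv]
    assert graph_file.suffix == ".csv"
```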
# You may obtain a copy of the License at @@ -13,19 +13,18 @@ import numpy as np import pytest import cugraph.dask as dcg -import cugraph.comms as Comms -from dask.distributed import Client import gc import cugraph import dask_cudf import cudf -from dask_cuda import LocalCUDACluster -from cugraph.dask.common.mg_utils import is_single_gpu +from cugraph.dask.common.mg_utils import (is_single_gpu, + setup_local_dask_cluster, + teardown_local_dask_cluster) + # The function selects personalization_perc% of accessible vertices in graph M # and randomly assigns them personalization values - def personalize(vertices, personalization_perc): personalization = None if personalization_perc != 0: @@ -52,17 +51,11 @@ def personalize(vertices, personalization_perc): PERSONALIZATION_PERC = [0, 10, 50] -@pytest.fixture +@pytest.fixture(scope="module") def client_connection(): - cluster = LocalCUDACluster() - client = Client(cluster) - Comms.initialize(p2p=True) - + (cluster, client) = setup_local_dask_cluster(p2p=True) yield client - - Comms.destroy() - client.close() - cluster.close() + teardown_local_dask_cluster(cluster, client) @pytest.mark.skipif( @@ -72,7 +65,10 @@ def client_connection(): def test_dask_pagerank(client_connection, personalization_perc): gc.collect() + # FIXME: update this to allow dataset to be parameterized and have dataset + # part of test param id (see other tests) input_data_path = r"../datasets/karate.csv" + print(f"dataset={input_data_path}") chunksize = dcg.get_chunksize(input_data_path) ddf = dask_cudf.read_csv( diff --git a/python/cugraph/tests/dask/test_mg_renumber.py b/python/cugraph/tests/dask/test_mg_renumber.py index 8456241ff26..de6d1ea4587 100644 --- a/python/cugraph/tests/dask/test_mg_renumber.py +++ b/python/cugraph/tests/dask/test_mg_renumber.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
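Besides the helper swap, every `client_connection` fixture in this diff gains `scope="module"`, which is the bigger performance win: the CUDA cluster now spins up once per test module instead of once per test function. Annotated:

```python
import pytest
from cugraph.dask.common.mg_utils import (setup_local_dask_cluster,
                                          teardown_local_dask_cluster)


@pytest.fixture(scope="module")
def client_connection():
    # Runs once per module, not once per test function.
    (cluster, client) = setup_local_dask_cluster(p2p=True)
    yield client   # every test in the module shares this client
    # Runs after the last test in the module completes.
    teardown_local_dask_cluster(cluster, client)
```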
# You may obtain a copy of the License at @@ -20,36 +20,31 @@ import numpy as np import cugraph.dask as dcg -import cugraph.comms as Comms -from dask.distributed import Client import cugraph import dask_cudf import dask import cudf -from dask_cuda import LocalCUDACluster from cugraph.tests import utils from cugraph.structure.number_map import NumberMap -from cugraph.dask.common.mg_utils import is_single_gpu +from cugraph.dask.common.mg_utils import (is_single_gpu, + setup_local_dask_cluster, + teardown_local_dask_cluster, + get_visible_devices) -@pytest.fixture +@pytest.fixture(scope="module") def client_connection(): - cluster = LocalCUDACluster() - client = Client(cluster) - Comms.initialize(p2p=True) - + (cluster, client) = setup_local_dask_cluster(p2p=True) yield client - - Comms.destroy() - client.close() - cluster.close() + teardown_local_dask_cluster(cluster, client) -# Test all combinations of default/managed and pooled/non-pooled allocation @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED) +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED, + ids=[f"dataset={d.as_posix()}" + for d in utils.DATASETS_UNRENUMBERED]) def test_mg_renumber(graph_file, client_connection): gc.collect() @@ -65,35 +60,40 @@ def test_mg_renumber(graph_file, client_connection): gdf["src"] = sources + translate gdf["dst"] = destinations + translate - ddf = dask.dataframe.from_pandas(gdf, npartitions=2) - - numbering = NumberMap() - numbering.from_dataframe(ddf, ["src", "src_old"], ["dst", "dst_old"]) - renumbered_df = numbering.add_internal_vertex_id( - numbering.add_internal_vertex_id(ddf, "src_id", ["src", "src_old"]), - "dst_id", - ["dst", "dst_old"], - ) - - check_src = numbering.from_internal_vertex_id( - renumbered_df, "src_id" - ).compute() - check_dst = numbering.from_internal_vertex_id( - renumbered_df, "dst_id" - ).compute() + ddf = dask.dataframe.from_pandas( + gdf, npartitions=len(get_visible_devices())) + + # preserve_order is not supported for MG + renumbered_df, renumber_map = NumberMap.renumber(ddf, + ["src", "src_old"], + ["dst", "dst_old"], + preserve_order=False) + unrenumbered_df = renumber_map.unrenumber(renumbered_df, "src", + preserve_order=False) + unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst", + preserve_order=False) + + # sort needed only for comparisons, since preserve_order is False + gdf = gdf.sort_values(by=["src", "src_old", "dst", "dst_old"]) + gdf = gdf.reset_index() + unrenumbered_df = unrenumbered_df.compute() + unrenumbered_df = unrenumbered_df.sort_values(by=["0_src", "1_src", + "0_dst", "1_dst"]) + unrenumbered_df = unrenumbered_df.reset_index() + + assert gdf["src"].equals(unrenumbered_df["0_src"]) + assert gdf["src_old"].equals(unrenumbered_df["1_src"]) + assert gdf["dst"].equals(unrenumbered_df["0_dst"]) + assert gdf["dst_old"].equals(unrenumbered_df["1_dst"]) - assert check_src["0"].to_pandas().equals(check_src["src"].to_pandas()) - assert check_src["1"].to_pandas().equals(check_src["src_old"].to_pandas()) - assert check_dst["0"].to_pandas().equals(check_dst["dst"].to_pandas()) - assert check_dst["1"].to_pandas().equals(check_dst["dst_old"].to_pandas()) - -# Test all combinations of default/managed and pooled/non-pooled allocation @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED) -def test_mg_renumber2(graph_file, 
client_connection): +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED, + ids=[f"dataset={d.as_posix()}" + for d in utils.DATASETS_UNRENUMBERED]) +def test_mg_renumber_add_internal_vertex_id(graph_file, client_connection): gc.collect() M = utils.read_csv_for_nx(graph_file) @@ -109,45 +109,8 @@ def test_mg_renumber2(graph_file, client_connection): gdf["dst"] = destinations + translate gdf["weight"] = gdf.index.astype(np.float) - ddf = dask.dataframe.from_pandas(gdf, npartitions=2) - - ren2, num2 = NumberMap.renumber( - ddf, ["src", "src_old"], ["dst", "dst_old"] - ) - - check_src = num2.from_internal_vertex_id(ren2, "src").compute() - check_src = check_src.sort_values("weight").reset_index(drop=True) - check_dst = num2.from_internal_vertex_id(ren2, "dst").compute() - check_dst = check_dst.sort_values("weight").reset_index(drop=True) - - assert check_src["0"].to_pandas().equals(gdf["src"].to_pandas()) - assert check_src["1"].to_pandas().equals(gdf["src_old"].to_pandas()) - assert check_dst["0"].to_pandas().equals(gdf["dst"].to_pandas()) - assert check_dst["1"].to_pandas().equals(gdf["dst_old"].to_pandas()) - - -# Test all combinations of default/managed and pooled/non-pooled allocation -@pytest.mark.skipif( - is_single_gpu(), reason="skipping MG testing on Single GPU system" -) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED) -def test_mg_renumber3(graph_file, client_connection): - gc.collect() - - M = utils.read_csv_for_nx(graph_file) - sources = cudf.Series(M["0"]) - destinations = cudf.Series(M["1"]) - - translate = 1000 - - gdf = cudf.DataFrame() - gdf["src_old"] = sources - gdf["dst_old"] = destinations - gdf["src"] = sources + translate - gdf["dst"] = destinations + translate - gdf["weight"] = gdf.index.astype(np.float) - - ddf = dask.dataframe.from_pandas(gdf, npartitions=2) + ddf = dask.dataframe.from_pandas( + gdf, npartitions=len(get_visible_devices())) ren2, num2 = NumberMap.renumber( ddf, ["src", "src_old"], ["dst", "dst_old"] @@ -195,9 +158,6 @@ def test_dask_pagerank(client_connection): dg = cugraph.DiGraph() dg.from_dask_cudf_edgelist(ddf, "src", "dst") - # Pre compute local data - # dg.compute_local_data(by='dst') - expected_pr = cugraph.pagerank(g) result_pr = dcg.pagerank(dg).compute() diff --git a/python/cugraph/tests/dask/test_mg_replication.py b/python/cugraph/tests/dask/test_mg_replication.py index 2b8510cd9ff..3974cf9ed82 100644 --- a/python/cugraph/tests/dask/test_mg_replication.py +++ b/python/cugraph/tests/dask/test_mg_replication.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
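The rewritten `test_mg_renumber` above exercises the public `NumberMap.renumber`/`unrenumber` round trip instead of the deleted internal-ID helpers, and it sizes the dask partitions to the visible GPUs. The essential pattern, condensed from the test (`ddf` is assumed to be a dask DataFrame with two-column vertex IDs, as constructed there):

```python
# Condensed round trip from test_mg_renumber; ddf is an assumption here.
from cugraph.structure.number_map import NumberMap

renumbered_df, renumber_map = NumberMap.renumber(
    ddf, ["src", "src_old"], ["dst", "dst_old"], preserve_order=False)

# Map internal vertex IDs back to the original column pairs; the output
# columns come back as "0_src"/"1_src" and "0_dst"/"1_dst".
out = renumber_map.unrenumber(renumbered_df, "src", preserve_order=False)
out = renumber_map.unrenumber(out, "dst", preserve_order=False)

# preserve_order=False is the only supported MG mode, so both sides must
# be sorted before comparing against the input edge list.
```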
# You may obtain a copy of the License at @@ -24,14 +24,19 @@ DATASETS_OPTIONS = utils.DATASETS_SMALL DIRECTED_GRAPH_OPTIONS = [False, True] -# MG_DEVICE_COUNT_OPTIONS = [1, 2, 3, 4] -MG_DEVICE_COUNT_OPTIONS = [1] +MG_DEVICE_COUNT_OPTIONS = [pytest.param(1, marks=pytest.mark.preset_gpu_count), + pytest.param(2, marks=pytest.mark.preset_gpu_count), + pytest.param(3, marks=pytest.mark.preset_gpu_count), + pytest.param(4, marks=pytest.mark.preset_gpu_count), + None] @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS) +@pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_replicate_cudf_dataframe_with_weights( input_data_path, mg_device_count @@ -57,7 +62,9 @@ def test_replicate_cudf_dataframe_with_weights( @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS) +@pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_replicate_cudf_dataframe_no_weights(input_data_path, mg_device_count): gc.collect() @@ -81,7 +88,9 @@ def test_replicate_cudf_dataframe_no_weights(input_data_path, mg_device_count): @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS) +@pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_replicate_cudf_series(input_data_path, mg_device_count): gc.collect() @@ -111,7 +120,9 @@ def test_replicate_cudf_series(input_data_path, mg_device_count): @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) +@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_enable_batch_no_context(graph_file, directed, mg_device_count): @@ -126,7 +137,9 @@ def test_enable_batch_no_context(graph_file, directed, mg_device_count): @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) +@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_enable_batch_no_context_view_adj( @@ -142,7 +155,9 @@ def test_enable_batch_no_context_view_adj( @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) +@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_enable_batch_context_then_views( @@ -171,7 +186,9 @@ def 
test_enable_batch_context_then_views( @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) +@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_enable_batch_view_then_context(graph_file, directed, mg_device_count): @@ -202,7 +219,9 @@ def test_enable_batch_view_then_context(graph_file, directed, mg_device_count): @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) +@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_enable_batch_context_no_context_views( @@ -227,7 +246,9 @@ def test_enable_batch_context_no_context_views( @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) +@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_enable_batch_edgelist_replication( @@ -248,7 +269,9 @@ def test_enable_batch_edgelist_replication( @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) +@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_enable_batch_adjlist_replication_weights( @@ -290,7 +313,9 @@ def test_enable_batch_adjlist_replication_weights( @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) +@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_enable_batch_adjlist_replication_no_weights( diff --git a/python/cugraph/tests/dask/test_mg_sssp.py b/python/cugraph/tests/dask/test_mg_sssp.py index ac4a60f1bdc..9e1fd1ec82f 100644 --- a/python/cugraph/tests/dask/test_mg_sssp.py +++ b/python/cugraph/tests/dask/test_mg_sssp.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,28 +12,21 @@ # limitations under the License. 
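The replication tests above replace the hard-coded `MG_DEVICE_COUNT_OPTIONS = [1]` with marked params plus a `None` case, which presumably means "use all visible GPUs". The custom mark lets runs without a configurable GPU layout deselect the fixed counts:

```python
import pytest

# Fixed device counts carry a custom mark so they can be deselected, e.g.
#   pytest -m "not preset_gpu_count"
# The mark name comes from the diff above; registering it (in pytest.ini
# or a conftest.py markers section) is assumed.
MG_DEVICE_COUNT_OPTIONS = [pytest.param(1, marks=pytest.mark.preset_gpu_count),
                           pytest.param(2, marks=pytest.mark.preset_gpu_count),
                           None]
```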
import cugraph.dask as dcg -import cugraph.comms as Comms -from dask.distributed import Client import gc import pytest import cugraph import dask_cudf import cudf -from dask_cuda import LocalCUDACluster -from cugraph.dask.common.mg_utils import is_single_gpu +from cugraph.dask.common.mg_utils import (is_single_gpu, + setup_local_dask_cluster, + teardown_local_dask_cluster) -@pytest.fixture +@pytest.fixture(scope="module") def client_connection(): - cluster = LocalCUDACluster() - client = Client(cluster) - Comms.initialize(p2p=True) - + (cluster, client) = setup_local_dask_cluster(p2p=True) yield client - - Comms.destroy() - client.close() - cluster.close() + teardown_local_dask_cluster(cluster, client) @pytest.mark.skipif( @@ -42,7 +35,10 @@ def client_connection(): def test_dask_sssp(client_connection): gc.collect() + # FIXME: update this to allow dataset to be parameterized and have dataset + # part of test param id (see other tests) input_data_path = r"../datasets/netscience.csv" + print(f"dataset={input_data_path}") chunksize = dcg.get_chunksize(input_data_path) ddf = dask_cudf.read_csv( diff --git a/python/cugraph/tests/dask/test_mg_utility.py b/python/cugraph/tests/dask/test_mg_utility.py index 808f1bcfa70..150fa0137f5 100644 --- a/python/cugraph/tests/dask/test_mg_utility.py +++ b/python/cugraph/tests/dask/test_mg_utility.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,16 +12,16 @@ # limitations under the License. import cugraph.dask as dcg -from dask.distributed import Client, default_client, futures_of, wait +from dask.distributed import default_client, futures_of, wait import gc import cugraph import dask_cudf -import cugraph.comms as Comms -from dask_cuda import LocalCUDACluster import pytest from cugraph.dask.common.part_utils import concat_within_workers from cugraph.dask.common.read_utils import get_n_workers -from cugraph.dask.common.mg_utils import is_single_gpu +from cugraph.dask.common.mg_utils import (is_single_gpu, + setup_local_dask_cluster, + teardown_local_dask_cluster) import os import time import numpy as np @@ -35,24 +35,21 @@ def setup_function(): gc.collect() -@pytest.fixture +@pytest.fixture(scope="module") def client_connection(): - cluster = LocalCUDACluster() - client = Client(cluster) - Comms.initialize(p2p=True) - + (cluster, client) = setup_local_dask_cluster(p2p=True) yield client - - Comms.destroy() - client.close() - cluster.close() + teardown_local_dask_cluster(cluster, client) @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) def test_from_edgelist(client_connection): + # FIXME: update this to allow dataset to be parameterized and have dataset + # part of test param id (see other tests) input_data_path = r"../datasets/karate.csv" + print(f"dataset={input_data_path}") chunksize = dcg.get_chunksize(input_data_path) ddf = dask_cudf.read_csv( input_data_path, @@ -74,40 +71,6 @@ def test_from_edgelist(client_connection): assert dg1.EdgeList == dg2.EdgeList -@pytest.mark.skipif( - is_single_gpu(), reason="skipping MG testing on Single GPU system" -) -def test_compute_local_data(client_connection): - - input_data_path = r"../datasets/karate.csv" - chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv( - input_data_path, - chunksize=chunksize, - 
delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg = cugraph.DiGraph() - dg.from_dask_cudf_edgelist( - ddf, source="src", destination="dst", edge_attr="value" - ) - - # Compute_local_data - dg.compute_local_data(by="dst") - data = dg.local_data["data"] - by = dg.local_data["by"] - - assert by == "dst" - assert Comms.is_initialized() - - global_num_edges = data.local_data["edges"].sum() - assert global_num_edges == dg.number_of_edges() - global_num_verts = data.local_data["verts"].sum() - assert global_num_verts == dg.number_of_nodes() - - @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) diff --git a/python/cugraph/tests/generators/test_rmat.py b/python/cugraph/tests/generators/test_rmat.py new file mode 100644 index 00000000000..a7c8701095e --- /dev/null +++ b/python/cugraph/tests/generators/test_rmat.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest + +import cudf +import dask_cudf + +from cugraph.dask.common.mg_utils import (is_single_gpu, + get_visible_devices, + setup_local_dask_cluster, + teardown_local_dask_cluster) +from cugraph.generators import rmat +import cugraph + + +############################################################################## +_cluster = None +_client = None +_is_single_gpu = is_single_gpu() +_visible_devices = get_visible_devices() +_scale_values = [2, 4, 16] +_scale_test_ids = [f"scale={x}" for x in _scale_values] +_mg_values = [False, True] +_mg_test_ids = [f"mg={x}" for x in _mg_values] +_graph_types = [cugraph.Graph, cugraph.DiGraph, None, int] +_graph_test_ids = [f"create_using={getattr(x,'__name__',str(x))}" + for x in _graph_types] + + +def _call_rmat(scale, num_edges, create_using, mg): + """ + Simplifies calling RMAT by requiring only specific args that are varied by + these tests and hard-coding all others. + """ + return rmat(scale=scale, + num_edges=num_edges, + a=0.1, + b=0.2, + c=0.3, + seed=24, + clip_and_flip=False, + scramble_vertex_ids=True, + create_using=create_using, + mg=mg) + + +############################################################################### +def setup_module(): + global _cluster + global _client + if not _is_single_gpu: + (_cluster, _client) = setup_local_dask_cluster(p2p=True) + + +def teardown_module(): + if not _is_single_gpu: + teardown_local_dask_cluster(_cluster, _client) + + +############################################################################### +@pytest.mark.parametrize("scale", _scale_values, ids=_scale_test_ids) +@pytest.mark.parametrize("mg", _mg_values, ids=_mg_test_ids) +def test_rmat_edgelist(scale, mg): + """ + Verifies that the edgelist returned by rmat() is valid based on inputs. 
+ """ + if mg and _is_single_gpu: + pytest.skip("skipping MG testing on Single GPU system") + + num_edges = (2**scale)*4 + create_using = None # Returns the edgelist from RMAT + + df = _call_rmat(scale, num_edges, create_using, mg) + + if mg: + assert df.npartitions == len(_visible_devices) + df_to_check = df.compute() + else: + df_to_check = df + + assert len(df_to_check) == num_edges + + +@pytest.mark.parametrize("graph_type", _graph_types, ids=_graph_test_ids) +@pytest.mark.parametrize("mg", _mg_values, ids=_mg_test_ids) +def test_rmat_return_type(graph_type, mg): + """ + Verifies that the return type returned by rmat() is valid (or the proper + exception is raised) based on inputs. + """ + if mg and _is_single_gpu: + pytest.skip("skipping MG testing on Single GPU system") + + scale = 2 + num_edges = (2**scale)*4 + + if (mg and (graph_type not in [cugraph.DiGraph, None])) or \ + (graph_type not in [cugraph.Graph, cugraph.DiGraph, None]): + + with pytest.raises(TypeError): + _call_rmat(scale, num_edges, graph_type, mg) + + else: + G_or_df = _call_rmat(scale, num_edges, graph_type, mg) + + if graph_type is None: + assert type(G_or_df) is dask_cudf.DataFrame if mg \ + else cudf.DataFrame + else: + assert type(G_or_df) is graph_type diff --git a/python/cugraph/tests/test_balanced_cut.py b/python/cugraph/tests/test_balanced_cut.py index f0fc7152e56..2492017511a 100644 --- a/python/cugraph/tests/test_balanced_cut.py +++ b/python/cugraph/tests/test_balanced_cut.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_betweenness_centrality.py b/python/cugraph/tests/test_betweenness_centrality.py old mode 100644 new mode 100755 index 33b2842645d..ee1a269e532 --- a/python/cugraph/tests/test_betweenness_centrality.py +++ b/python/cugraph/tests/test_betweenness_centrality.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION.: +# Copyright (c) 2020-2021, NVIDIA CORPORATION.: # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -68,49 +68,36 @@ def calc_betweenness_centrality( edgevals=False, ): """ Generate both cugraph and networkx betweenness centrality - Parameters ---------- graph_file : string Path to COO Graph representation in .csv format - directed : bool, optional, default=True - k : int or None, optional, default=None int: Number of sources to sample from None: All sources are used to compute - normalized : bool True: Normalize Betweenness Centrality scores False: Scores are left unnormalized - weight : cudf.DataFrame: Not supported as of 06/2020 - endpoints : bool True: Endpoints are included when computing scores False: Endpoints are not considered - seed : int or None, optional, default=None Seed for random sampling of the starting point - result_dtype : numpy.dtype Expected type of the result, either np.float32 or np.float64 - use_k_full : bool When True, if k is None replaces k by the number of sources of the Graph - multi_gpu_batch : bool When True, enable mg batch after constructing the graph - edgevals: bool When True, enable tests with weighted graph, should be ignored during computation. 
- Returns ------- - sorted_df : cudf.DataFrame Contains 'vertex' and 'cu_bc' 'ref_bc' columns, where 'cu_bc' and 'ref_bc' are the two betweenness centrality scores to compare. @@ -347,6 +334,7 @@ def test_betweenness_centrality( @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) @pytest.mark.parametrize("use_k_full", [True]) @pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) +@pytest.mark.skip(reason="Skipping large tests") def test_betweenness_centrality_k_full( graph_file, directed, @@ -390,6 +378,7 @@ def test_betweenness_centrality_k_full( @pytest.mark.parametrize("subset_seed", [None]) @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) @pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) +@pytest.mark.skip(reason="Skipping large tests") def test_betweenness_centrality_fixed_sample( graph_file, directed, @@ -402,7 +391,6 @@ def test_betweenness_centrality_fixed_sample( edgevals ): """Test Betweenness Centrality using a subset - Only k sources are considered for an approximate Betweenness Centrality """ prepare_test() @@ -429,6 +417,7 @@ def test_betweenness_centrality_fixed_sample( @pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS) @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) @pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) +@pytest.mark.skip(reason="Skipping large tests") def test_betweenness_centrality_weight_except( graph_file, directed, @@ -441,7 +430,6 @@ def test_betweenness_centrality_weight_except( edgevals ): """Calls betwenness_centrality with weight - As of 05/28/2020, weight is not supported and should raise a NotImplementedError """ diff --git a/python/cugraph/tests/test_bfs.py b/python/cugraph/tests/test_bfs.py index 0070a34248c..a8547d692c2 100644 --- a/python/cugraph/tests/test_bfs.py +++ b/python/cugraph/tests/test_bfs.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -51,6 +51,7 @@ DEFAULT_EPSILON = 1e-6 +DEPTH_LIMITS = [None, 1, 5, 18] # Map of cuGraph input types to the expected output type for cuGraph # connected_components calls. @@ -148,28 +149,14 @@ def compare_single_sp_counter(result, expected, epsilon=DEFAULT_EPSILON): return np.isclose(result, expected, rtol=epsilon) -def compare_bfs(benchmark_callable, G, nx_values, start_vertex, - return_sp_counter=False): +def compare_bfs(benchmark_callable, G, nx_values, start_vertex, depth_limit): """ Genereate both cugraph and reference bfs traversal. 
""" if isinstance(start_vertex, int): - result = benchmark_callable(cugraph.bfs_edges, G, start_vertex, - return_sp_counter=return_sp_counter) + result = benchmark_callable(cugraph.bfs_edges, G, start_vertex) cugraph_df = convert_output_to_cudf(G, result) - - if return_sp_counter: - # This call should only contain 3 columns: - # 'vertex', 'distance', 'predecessor', 'sp_counter' - assert len(cugraph_df.columns) == 4, ( - "The result of the BFS has an invalid " "number of columns" - ) - - if return_sp_counter: - compare_func = _compare_bfs_spc - - else: - compare_func = _compare_bfs + compare_func = _compare_bfs # NOTE: We need to take 2 different path for verification as the nx # functions used as reference return dictionaries that might @@ -185,18 +172,15 @@ def compare_bfs(benchmark_callable, G, nx_values, start_vertex, def func_to_benchmark(): for sv in start_vertex: cugraph_df = cugraph.bfs_edges( - G, sv, return_sp_counter=return_sp_counter) + G, sv, depth_limit=depth_limit) all_cugraph_distances.append(cugraph_df) benchmark_callable(func_to_benchmark) - compare_func = _compare_bfs_spc if return_sp_counter else _compare_bfs + compare_func = _compare_bfs for (i, sv) in enumerate(start_vertex): cugraph_df = convert_output_to_cudf(G, all_cugraph_distances[i]) - if return_sp_counter: - assert len(cugraph_df.columns) == 4, ( - "The result of the BFS has an invalid " "number of columns" - ) + compare_func(cugraph_df, all_nx_values[i], sv) else: # Unknown type given to seed @@ -272,55 +256,6 @@ def _compare_bfs(cugraph_df, nx_distances, source): assert invalid_predecessor_error == 0, "There are invalid predecessors" -def _compare_bfs_spc(cugraph_df, nx_sp_counter, unused): - """ - Compare BFS with shortest path counters. - """ - sorted_nx = [nx_sp_counter[key] for key in sorted(nx_sp_counter.keys())] - # We are not checking for distances / predecessors here as we assume - # that these have been checked in the _compare_bfs tests - # We focus solely on shortest path counting - - # cugraph return a dataframe that should contain exactly one time each - # vertex - # We could us isin to filter only vertices that are common to both - # But it would slow down the comparison, and in this specific case - # nxacb._single_source_shortest_path_basic is a dictionary containing all - # the vertices. - # There is no guarantee when we get `df` that the vertices are sorted - # thus we enforce the order so that we can leverage faster comparison after - sorted_df = cugraph_df.sort_values("vertex").rename( - columns={"sp_counter": "cu_spc"}, copy=False - ) - - # This allows to detect vertices identifier that could have been - # wrongly present multiple times - cu_vertices = set(sorted_df['vertex'].values_host) - nx_vertices = nx_sp_counter.keys() - assert len(cu_vertices.intersection(nx_vertices)) == len( - nx_vertices - ), "There are missing vertices" - - # We add the nx shortest path counter in the cudf.DataFrame, both the - # the DataFrame and `sorted_nx` are sorted base on vertices identifiers - sorted_df["nx_spc"] = sorted_nx - - # We could use numpy.isclose or cupy.isclose, we can then get the entries - # in the cudf.DataFrame where there are is a mismatch. 
- # numpy / cupy allclose would get only a boolean and we might want the - # extra information about the discrepancies - shortest_path_counter_errors = sorted_df[ - ~cupy.isclose( - sorted_df["cu_spc"], sorted_df["nx_spc"], rtol=DEFAULT_EPSILON - ) - ] - if len(shortest_path_counter_errors) > 0: - print(shortest_path_counter_errors) - assert len(shortest_path_counter_errors) == 0, ( - "Shortest path counters " "are too different" - ) - - def get_nx_graph_and_params(dataset, directed): """ Helper for fixtures returning a Nx graph obj and params. @@ -329,21 +264,17 @@ def get_nx_graph_and_params(dataset, directed): utils.generate_nx_graph_from_file(dataset, directed)) -def get_nx_results_and_params(seed, use_spc, dataset, directed, Gnx): +def get_nx_results_and_params(seed, depth_limit, dataset, directed, Gnx): """ Helper for fixtures returning Nx results and params. """ random.seed(seed) start_vertex = random.sample(Gnx.nodes(), 1)[0] - if use_spc: - _, _, nx_sp_counter = \ - nxacb._single_source_shortest_path_basic(Gnx, start_vertex) - nx_values = nx_sp_counter - else: - nx_values = nx.single_source_shortest_path_length(Gnx, start_vertex) + nx_values = nx.single_source_shortest_path_length(Gnx, start_vertex, + cutoff=depth_limit) - return (dataset, directed, nx_values, start_vertex, use_spc) + return (dataset, directed, nx_values, start_vertex, depth_limit) # ============================================================================= @@ -353,7 +284,7 @@ def get_nx_results_and_params(seed, use_spc, dataset, directed, Gnx): DIRECTED = [pytest.param(d) for d in DIRECTED_GRAPH_OPTIONS] DATASETS = [pytest.param(d) for d in utils.DATASETS] DATASETS_SMALL = [pytest.param(d) for d in utils.DATASETS_SMALL] -USE_SHORTEST_PATH_COUNTER = [pytest.param(False), pytest.param(True)] +DEPTH_LIMIT = [pytest.param(d) for d in DEPTH_LIMITS] # Call genFixtureParamsProduct() to caluculate the cartesian product of # multiple lists of params. This is required since parameterized fixtures do @@ -362,7 +293,7 @@ def get_nx_results_and_params(seed, use_spc, dataset, directed, Gnx): # full test name. algo_test_fixture_params = utils.genFixtureParamsProduct( (SEEDS, "seed"), - (USE_SHORTEST_PATH_COUNTER, "spc")) + (DEPTH_LIMIT, "depth_limit")) graph_fixture_params = utils.genFixtureParamsProduct( (DATASETS, "ds"), @@ -377,7 +308,7 @@ def get_nx_results_and_params(seed, use_spc, dataset, directed, Gnx): # was covered elsewhere). 
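The BFS changes in this stretch of the diff drop the `return_sp_counter` option (and its `_compare_bfs_spc` checker) in favor of a `depth_limit` parameter validated against NetworkX's `cutoff`. A minimal sketch of the equivalence being tested, on a stand-in graph rather than the CSV datasets the real tests read:

```python
# Stand-in graph (an assumption); the real tests read the datasets above.
import networkx as nx

Gnx = nx.karate_club_graph()
depth_limit = 2

# NetworkX reference: cutoff bounds the traversal depth...
nx_values = nx.single_source_shortest_path_length(Gnx, 0, cutoff=depth_limit)
assert max(nx_values.values()) <= depth_limit
# ...and cugraph.bfs_edges(G, 0, depth_limit=depth_limit) is compared
# against these values by compare_bfs above.
```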
single_algo_test_fixture_params = utils.genFixtureParamsProduct( ([SEEDS[0]], "seed"), - ([USE_SHORTEST_PATH_COUNTER[0]], "spc")) + ([DEPTH_LIMIT[0]], "depth_limit")) single_small_graph_fixture_params = utils.genFixtureParamsProduct( ([DATASETS_SMALL[0]], "ds"), @@ -446,7 +377,7 @@ def test_bfs(gpubenchmark, dataset_nxresults_startvertex_spc, """ Test BFS traversal on random source with distance and predecessors """ - (dataset, directed, nx_values, start_vertex, use_spc) = \ + (dataset, directed, nx_values, start_vertex, depth_limit) = \ dataset_nxresults_startvertex_spc # special case: ensure cugraph and Nx Graph types are DiGraphs if @@ -463,8 +394,7 @@ def test_bfs(gpubenchmark, dataset_nxresults_startvertex_spc, compare_bfs( gpubenchmark, - G_or_matrix, nx_values, start_vertex, return_sp_counter=use_spc - ) + G_or_matrix, nx_values, start_vertex, depth_limit) @pytest.mark.parametrize("cugraph_input_type", @@ -477,36 +407,6 @@ def test_bfs_nonnative_inputs(gpubenchmark, cugraph_input_type) -@pytest.mark.parametrize("cugraph_input_type", utils.CUGRAPH_INPUT_TYPES) -def test_bfs_spc_full(gpubenchmark, dataset_nxresults_allstartvertices_spc, - cugraph_input_type): - """ - Test BFS traversal on every vertex with shortest path counting - """ - (dataset, directed, all_nx_values, start_vertices, use_spc) = \ - dataset_nxresults_allstartvertices_spc - - # use_spc is currently always True - - # special case: ensure cugraph and Nx Graph types are DiGraphs if - # "directed" is set, since the graph type parameterization is currently - # independent of the directed parameter. Unfortunately this does not - # change the "id" in the pytest output. - if directed: - if cugraph_input_type is cugraph.Graph: - cugraph_input_type = cugraph.DiGraph - elif cugraph_input_type is nx.Graph: - cugraph_input_type = nx.DiGraph - - G_or_matrix = utils.create_obj_from_csv(dataset, cugraph_input_type) - - compare_bfs( - gpubenchmark, - G_or_matrix, all_nx_values, start_vertex=start_vertices, - return_sp_counter=use_spc - ) - - def test_scipy_api_compat(): graph_file = utils.DATASETS[0] @@ -522,7 +422,7 @@ def test_scipy_api_compat(): # Ensure cugraph-compatible options work as expected cugraph.bfs(input_cugraph_graph, i_start=0) - cugraph.bfs(input_cugraph_graph, i_start=0, return_sp_counter=True) + cugraph.bfs(input_cugraph_graph, i_start=0) # cannot have start and i_start with pytest.raises(TypeError): cugraph.bfs(input_cugraph_graph, start=0, i_start=0) @@ -531,7 +431,6 @@ def test_scipy_api_compat(): cugraph.bfs(input_coo_matrix, i_start=0) cugraph.bfs(input_coo_matrix, i_start=0, directed=True) cugraph.bfs(input_coo_matrix, i_start=0, directed=False) - result = cugraph.bfs(input_coo_matrix, i_start=0, - return_sp_counter=True) + result = cugraph.bfs(input_coo_matrix, i_start=0) assert type(result) is tuple - assert len(result) == 3 + assert len(result) == 2 diff --git a/python/cugraph/tests/test_connectivity.py b/python/cugraph/tests/test_connectivity.py index f957c4b417b..194147ab620 100644 --- a/python/cugraph/tests/test_connectivity.py +++ b/python/cugraph/tests/test_connectivity.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_convert_matrix.py b/python/cugraph/tests/test_convert_matrix.py index d418dd7ce2e..1dbf51910ea 100644 --- a/python/cugraph/tests/test_convert_matrix.py +++ b/python/cugraph/tests/test_convert_matrix.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_core_number.py b/python/cugraph/tests/test_core_number.py index edbc7b0597b..9cfc37ba1c5 100644 --- a/python/cugraph/tests/test_core_number.py +++ b/python/cugraph/tests/test_core_number.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_ecg.py b/python/cugraph/tests/test_ecg.py index 4dc01c389cc..e51ef9b7a98 100644 --- a/python/cugraph/tests/test_ecg.py +++ b/python/cugraph/tests/test_ecg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -14,34 +14,39 @@ import gc import pytest - import networkx as nx import cugraph + from cugraph.tests import utils +from pathlib import PurePath + def cugraph_call(G, min_weight, ensemble_size): df = cugraph.ecg(G, min_weight, ensemble_size) num_parts = df["partition"].max() + 1 - score = cugraph.analyzeClustering_modularity(G, num_parts, df, - 'vertex', 'partition') + score = cugraph.analyzeClustering_modularity( + G, num_parts, df, "vertex", "partition" + ) return score, num_parts def golden_call(graph_file): - if graph_file == "../datasets/dolphins.csv": + if graph_file == PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / "dolphins.csv": return 0.4962422251701355 - if graph_file == "../datasets/karate.csv": + if graph_file == PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / "karate.csv": return 0.38428664207458496 - if graph_file == "../datasets/netscience.csv": + if ( + graph_file + == PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / "netscience.csv" + ): return 0.9279554486274719 DATASETS = [ - "../datasets/karate.csv", - "../datasets/dolphins.csv", - "../datasets/netscience.csv", + PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / f + for f in ["karate.csv", "dolphins.csv", "netscience.csv"] ] MIN_WEIGHTS = [0.05, 0.10, 0.15] @@ -78,9 +83,10 @@ def test_ecg_clustering_nx(graph_file, min_weight, ensemble_size): # Read in the graph and get a NetworkX graph M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) G = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", - create_using=nx.Graph() + M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() ) # Get the modularity score for partitioning versus random assignment - _ = cugraph.ecg(G, min_weight, ensemble_size, "weight") + df_dict = cugraph.ecg(G, min_weight, ensemble_size, "weight") + + assert isinstance(df_dict, dict) diff --git a/python/cugraph/tests/test_edge_betweenness_centrality.py b/python/cugraph/tests/test_edge_betweenness_centrality.py index 529b0b9de9c..6caad0d9fad 
100644 --- a/python/cugraph/tests/test_edge_betweenness_centrality.py +++ b/python/cugraph/tests/test_edge_betweenness_centrality.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION.: +# Copyright (c) 2019-2021, NVIDIA CORPORATION.: # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -341,6 +341,7 @@ def test_edge_betweenness_centrality( @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) @pytest.mark.parametrize("use_k_full", [True]) @pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) +@pytest.mark.skip(reason="Skipping large tests") def test_edge_betweenness_centrality_k_full( graph_file, directed, @@ -381,6 +382,7 @@ def test_edge_betweenness_centrality_k_full( @pytest.mark.parametrize("subset_seed", [None]) @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) @pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) +@pytest.mark.skip(reason="Skipping large tests") def test_edge_betweenness_centrality_fixed_sample( graph_file, directed, @@ -417,6 +419,7 @@ def test_edge_betweenness_centrality_fixed_sample( @pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS) @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) @pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) +@pytest.mark.skip(reason="Skipping large tests") def test_edge_betweenness_centrality_weight_except( graph_file, directed, diff --git a/python/cugraph/tests/test_egonet.py b/python/cugraph/tests/test_egonet.py new file mode 100644 index 00000000000..fc0ce38eb9c --- /dev/null +++ b/python/cugraph/tests/test_egonet.py @@ -0,0 +1,117 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc + +import pytest + +import cudf +import cugraph +from cugraph.tests import utils + +# Temporarily suppress warnings till networkX fixes deprecation warnings +# (Using or importing the ABCs from 'collections' instead of from +# 'collections.abc' is deprecated, and in 3.8 it will stop working) for +# python 3.7. Also, this import networkx needs to be relocated in the +# third-party group once this gets fixed. 
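The ECG rewrite above (and the force_atlas2 changes further down) stop hard-coding `"../datasets/..."` strings and instead build dataset paths from the shared root, so comparisons in `golden_call` and manipulations like `.with_suffix(".mtx")` become `PurePath` operations. The pattern:

```python
from pathlib import PurePath
from cugraph.tests import utils

# Dataset-path pattern introduced in this diff.
DATASETS = [PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / f
            for f in ["karate.csv", "dolphins.csv", "netscience.csv"]]

matrix_file = DATASETS[0].with_suffix(".mtx")   # .../karate.mtx
```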
+import warnings + +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning) + import networkx as nx + +print("Networkx version : {} ".format(nx.__version__)) + +SEEDS = [0, 5, 13] +RADIUS = [1, 2, 3] + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("radius", RADIUS) +def test_ego_graph_nx(graph_file, seed, radius): + gc.collect() + + # Nx + df = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + Gnx = nx.from_pandas_edgelist( + df, create_using=nx.Graph(), source="0", target="1", edge_attr="weight" + ) + ego_nx = nx.ego_graph(Gnx, seed, radius=radius) + + # cugraph + ego_cugraph = cugraph.ego_graph(Gnx, seed, radius=radius) + + assert nx.is_isomorphic(ego_nx, ego_cugraph) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("seeds", [[0, 5, 13]]) +@pytest.mark.parametrize("radius", [1, 2, 3]) +def test_batched_ego_graphs(graph_file, seeds, radius): + gc.collect() + + # Nx + df = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + Gnx = nx.from_pandas_edgelist( + df, create_using=nx.Graph(), source="0", target="1", edge_attr="weight" + ) + + # cugraph + df, offsets = cugraph.batched_ego_graphs(Gnx, seeds, radius=radius) + for i in range(len(seeds)): + ego_nx = nx.ego_graph(Gnx, seeds[i], radius=radius) + ego_df = df[offsets[i]:offsets[i + 1]] + ego_cugraph = nx.from_pandas_edgelist( + ego_df, source="src", target="dst", edge_attr="weight" + ) + assert nx.is_isomorphic(ego_nx, ego_cugraph) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("radius", RADIUS) +def test_multi_column_ego_graph(graph_file, seed, radius): + gc.collect() + + df = utils.read_csv_file(graph_file, read_weights_in_sp=True) + df.rename(columns={'0': 'src_0', '1': 'dst_0'}, inplace=True) + df['src_1'] = df['src_0'] + 1000 + df['dst_1'] = df['dst_0'] + 1000 + + G1 = cugraph.Graph() + G1.from_cudf_edgelist( + df, source=["src_0", "src_1"], destination=["dst_0", "dst_1"], + edge_attr="2" + ) + + seed_df = cudf.DataFrame() + seed_df['v_0'] = [seed] + seed_df['v_1'] = [seed + 1000] + + ego_cugraph_res = cugraph.ego_graph(G1, seed_df, radius=radius) + + G2 = cugraph.Graph() + G2.from_cudf_edgelist( + df, source="src_0", destination="dst_0", + edge_attr="2" + ) + ego_cugraph_exp = cugraph.ego_graph(G2, seed, radius=radius) + + # FIXME: Replace with multi-column view_edge_list() + edgelist_df = ego_cugraph_res.edgelist.edgelist_df + edgelist_df_res = ego_cugraph_res.unrenumber(edgelist_df, "src") + edgelist_df_res = ego_cugraph_res.unrenumber(edgelist_df_res, "dst") + for i in range(len(edgelist_df_res)): + assert ego_cugraph_exp.has_edge(edgelist_df_res["0_src"].iloc[i], + edgelist_df_res["0_dst"].iloc[i]) diff --git a/python/cugraph/tests/test_filter_unreachable.py b/python/cugraph/tests/test_filter_unreachable.py index 29b862f0285..6c00461d234 100644 --- a/python/cugraph/tests/test_filter_unreachable.py +++ b/python/cugraph/tests/test_filter_unreachable.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
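The new ego-net tests above lean on NetworkX interop: `cugraph.ego_graph` accepts the NetworkX graph directly and returns a comparable graph, so isomorphism is the correctness check. Reduced to its core, with a stand-in weighted graph (an assumption; the real tests read CSV datasets):

```python
import networkx as nx
import cugraph

# Stand-in weighted path graph (assumption).
Gnx = nx.Graph()
Gnx.add_weighted_edges_from([(0, 1, 1.0), (1, 2, 1.0),
                             (2, 3, 1.0), (3, 4, 1.0)])

ego_nx = nx.ego_graph(Gnx, 0, radius=2)
ego_cugraph = cugraph.ego_graph(Gnx, 0, radius=2)   # NX in, NX-comparable out
assert nx.is_isomorphic(ego_nx, ego_cugraph)
```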
# You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_force_atlas2.py b/python/cugraph/tests/test_force_atlas2.py index 4de49cb4088..1128f52904a 100644 --- a/python/cugraph/tests/test_force_atlas2.py +++ b/python/cugraph/tests/test_force_atlas2.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,14 +12,15 @@ # limitations under the License. import time - import pytest +import cudf import cugraph from cugraph.internals import GraphBasedDimRedCallback from cugraph.tests import utils from sklearn.manifold import trustworthiness import scipy.io +from pathlib import PurePath # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -61,11 +62,14 @@ def cugraph_call(cu_M, max_iter, pos_list, outbound_attraction_distribution, DATASETS = [ - ("../datasets/karate.csv", 0.70), - ("../datasets/polbooks.csv", 0.75), - ("../datasets/dolphins.csv", 0.66), - ("../datasets/netscience.csv", 0.66), + (PurePath(utils.RAPIDS_DATASET_ROOT_DIR)/f,)+(d,) for (f, d) in [ + ("karate.csv", 0.70), + ("polbooks.csv", 0.75), + ("dolphins.csv", 0.66), + ("netscience.csv", 0.66)] ] + + MAX_ITERATIONS = [500] BARNES_HUT_OPTIMIZE = [False, True] @@ -120,7 +124,7 @@ def test_force_atlas2(graph_file, score, max_iter, iterations on a given graph. """ - matrix_file = graph_file[:-4] + ".mtx" + matrix_file = graph_file.with_suffix(".mtx") M = scipy.io.mmread(matrix_file) M = M.todense() cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas()) @@ -132,3 +136,70 @@ def test_force_atlas2(graph_file, score, max_iter, assert test_callback.on_epoch_end_called_count == max_iter # verify `on_train_end` was only called once assert test_callback.on_train_end_called_count == 1 + + +# FIXME: this test occasionally fails - skipping to prevent CI failures but +# need to revisit ASAP +@pytest.mark.skip(reason="non-deterministric - needs fixing!") +@pytest.mark.parametrize('graph_file, score', DATASETS[:-1]) +@pytest.mark.parametrize('max_iter', MAX_ITERATIONS) +@pytest.mark.parametrize('barnes_hut_optimize', BARNES_HUT_OPTIMIZE) +def test_force_atlas2_multi_column_pos_list(graph_file, score, max_iter, + barnes_hut_optimize): + cu_M = utils.read_csv_file(graph_file) + test_callback = TestCallback() + pos = cugraph_call(cu_M, + max_iter=max_iter, + pos_list=None, + outbound_attraction_distribution=True, + lin_log_mode=False, + prevent_overlapping=False, + edge_weight_influence=1.0, + jitter_tolerance=1.0, + barnes_hut_optimize=False, + barnes_hut_theta=0.5, + scaling_ratio=2.0, + strong_gravity_mode=False, + gravity=1.0, + callback=test_callback) + + cu_M.rename(columns={'0': 'src_0', '1': 'dst_0'}, inplace=True) + cu_M['src_1'] = cu_M['src_0'] + 1000 + cu_M['dst_1'] = cu_M['dst_0'] + 1000 + + G = cugraph.Graph() + G.from_cudf_edgelist( + cu_M, source=["src_0", "src_1"], + destination=["dst_0", "dst_1"], + edge_attr="2" + ) + + pos_list = cudf.DataFrame() + pos_list['vertex_0'] = pos['vertex'] + pos_list['vertex_1'] = pos_list['vertex_0'] + 1000 + pos_list['x'] = pos['x'] + pos_list['y'] = pos['y'] + + cu_pos = cugraph.force_atlas2( + G, + max_iter=max_iter, + pos_list=pos_list, + outbound_attraction_distribution=True, + lin_log_mode=False, + prevent_overlapping=False, + 
edge_weight_influence=1.0, + jitter_tolerance=1.0, + barnes_hut_optimize=False, + barnes_hut_theta=0.5, + scaling_ratio=2.0, + strong_gravity_mode=False, + gravity=1.0, + callback=test_callback) + + cu_pos = cu_pos.sort_values('0_vertex') + matrix_file = graph_file.with_suffix(".mtx") + M = scipy.io.mmread(matrix_file) + M = M.todense() + cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas()) + print(cu_trust, score) + assert cu_trust > score diff --git a/python/cugraph/tests/test_graph.py b/python/cugraph/tests/test_graph.py index d8d5a504070..fa68a50c952 100644 --- a/python/cugraph/tests/test_graph.py +++ b/python/cugraph/tests/test_graph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -20,7 +20,7 @@ import scipy import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.testing import assert_frame_equal import cugraph from cugraph.tests import utils @@ -200,6 +200,7 @@ def test_add_adj_list_to_edge_list(graph_file): # cugraph add_adj_list to_edge_list call G = cugraph.DiGraph() G.from_cudf_adjlist(offsets, indices, None) + edgelist = G.view_edge_list() sources_cu = edgelist["src"] destinations_cu = edgelist["dst"] @@ -326,7 +327,7 @@ def test_edges_for_Graph(graph_file): else: edges.append([edge[0], edge[1]]) nx_edge_list = cudf.DataFrame(list(edges), columns=['src', 'dst']) - assert_eq( + assert_frame_equal( nx_edge_list.sort_values(by=['src', 'dst']).reset_index(drop=True), cu_edge_list.sort_values(by=['src', 'dst']).reset_index(drop=True), check_dtype=False @@ -535,6 +536,7 @@ def test_to_directed(graph_file): DiG = G.to_directed() DiGnx = Gnx.to_directed() + assert DiG.is_directed() assert DiG.number_of_nodes() == DiGnx.number_of_nodes() assert DiG.number_of_edges() == DiGnx.number_of_edges() @@ -569,6 +571,7 @@ def test_to_undirected(graph_file): G = DiG.to_undirected() Gnx = DiGnx.to_undirected() + assert not G.is_directed() assert G.number_of_nodes() == Gnx.number_of_nodes() assert G.number_of_edges() == Gnx.number_of_edges() @@ -627,17 +630,13 @@ def test_bipartite_api(graph_file): set2_exp = cudf.Series(set(nodes.values_host) - set(set1_exp.values_host)) - G = cugraph.Graph() - assert not G.is_bipartite() + G = cugraph.BiPartiteGraph() + assert G.is_bipartite() # Add a set of nodes present in one partition G.add_nodes_from(set1_exp, bipartite='set1') G.from_cudf_edgelist(cu_M, source='0', destination='1') - # Check if Graph is bipartite. It should return True since we have - # added the partition in add_nodes_from() - assert G.is_bipartite() - # Call sets() to get the bipartite set of nodes. set1, set2 = G.sets() diff --git a/python/cugraph/tests/test_hits.py b/python/cugraph/tests/test_hits.py index 6b6f54937a6..9229f3734f8 100644 --- a/python/cugraph/tests/test_hits.py +++ b/python/cugraph/tests/test_hits.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_hungarian.py b/python/cugraph/tests/test_hungarian.py index 280903bc303..4183bcc2c89 100644 --- a/python/cugraph/tests/test_hungarian.py +++ b/python/cugraph/tests/test_hungarian.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,7 +12,6 @@ # limitations under the License. import gc -from itertools import product from timeit import default_timer as timer import numpy as np @@ -21,7 +20,6 @@ import cudf import cugraph from scipy.optimize import linear_sum_assignment -import rmm def create_random_bipartite(v1, v2, size, dtype): @@ -54,33 +52,23 @@ def create_random_bipartite(v1, v2, size, dtype): return df1['src'], g, a -SPARSE_SIZES = [[5, 5, 100], [500, 500, 10000], [5000, 5000, 100000]] +SPARSE_SIZES = [[5, 5, 100], [500, 500, 10000]] +DENSE_SIZES = [[5, 100], [500, 10000]] def setup_function(): gc.collect() -# Test all combinations of default/managed and pooled/non-pooled allocation -@pytest.mark.parametrize('managed, pool', - list(product([False, True], [False, True]))) @pytest.mark.parametrize('v1_size, v2_size, weight_limit', SPARSE_SIZES) -def test_hungarian(managed, pool, v1_size, v2_size, weight_limit): - rmm.reinitialize( - managed_memory=managed, - pool_allocator=pool, - initial_pool_size=2 << 27 - ) - - assert(rmm.is_initialized()) - +def test_hungarian(v1_size, v2_size, weight_limit): v1, g, m = create_random_bipartite(v1_size, v2_size, weight_limit, np.float) start = timer() - matching = cugraph.hungarian(g, v1) + cugraph_cost, matching = cugraph.hungarian(g, v1) end = timer() print('cugraph time: ', (end - start)) @@ -93,14 +81,29 @@ def test_hungarian(managed, pool, v1_size, v2_size, weight_limit): scipy_cost = m[np_matching[0], np_matching[1]].sum() - cugraph_df = matching.merge(g.edgelist.edgelist_df, - left_on=['vertex', 'assignment'], - right_on=['src', 'dst'], - how='left') + assert(scipy_cost == cugraph_cost) + - cugraph_cost = cugraph_df['weights'].sum() +@pytest.mark.parametrize('n, weight_limit', DENSE_SIZES) +def test_dense_hungarian(n, weight_limit): + C = np.random.uniform( + 0, weight_limit, size=(n, n) + ).round().astype(np.float32) + + C_series = cudf.Series(C.flatten()) + + start = timer() + cugraph_cost, matching = cugraph.dense_hungarian(C_series, n, n) + end = timer() + + print('cugraph time: ', (end - start)) + + start = timer() + np_matching = linear_sum_assignment(C) + end = timer() + + print('scipy time: ', (end - start)) - print('scipy_cost = ', scipy_cost) - print('cugraph_cost = ', cugraph_cost) + scipy_cost = C[np_matching[0], np_matching[1]].sum() assert(scipy_cost == cugraph_cost) diff --git a/python/cugraph/tests/test_hypergraph.py b/python/cugraph/tests/test_hypergraph.py index dbce89905cd..be48168e834 100644 --- a/python/cugraph/tests/test_hypergraph.py +++ b/python/cugraph/tests/test_hypergraph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
@@ -34,12 +34,13 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import cudf
-from cudf.tests.utils import assert_eq
-import cugraph
 import datetime as dt
+
 import pandas as pd
 import pytest
+import cudf
+from cudf.testing.testing import assert_frame_equal
+import cugraph
 
 
 simple_df = cudf.DataFrame.from_pandas(pd.DataFrame({
@@ -107,11 +108,10 @@ def test_hyperedges(categorical_metadata):
 
     h = cugraph.hypergraph(simple_df, categorical_metadata=categorical_metadata)
 
-    assert_eq(
-        len(h.keys()), len(["entities", "nodes", "edges", "events", "graph"])
-    )
+    assert len(h.keys()) == len(
+        ["entities", "nodes", "edges", "events", "graph"])
 
-    edges = pd.DataFrame({
+    edges = cudf.from_pandas(pd.DataFrame({
         "event_id": [
             "event_id::0",
             "event_id::1",
@@ -158,25 +158,24 @@ def test_hyperedges(categorical_metadata):
         "a1": [1, 2, 3] * 4,
         "a2": ["red", "blue", "green"] * 4,
         "🙈": ["æski ēˈmōjē", "😋", "s"] * 4,
-    })
+    }))
 
     if categorical_metadata:
         edges = edges.astype({"edge_type": "category"})
 
-    assert_eq(edges, h["edges"])
-
+    assert_frame_equal(edges, h["edges"], check_dtype=False)
 
     for (k, v) in [
         ("entities", 12), ("nodes", 15), ("edges", 12), ("events", 3)
     ]:
-        assert_eq(len(h[k]), v)
+        assert len(h[k]) == v
 
 
 def test_hyperedges_direct():
     h = cugraph.hypergraph(hyper_df, direct=True)
 
-    assert_eq(len(h["edges"]), 9)
-    assert_eq(len(h["nodes"]), 9)
+    assert len(h["edges"]) == 9
+    assert len(h["nodes"]) == 9
 
 
 def test_hyperedges_direct_categories():
@@ -191,8 +190,8 @@
         },
     )
 
-    assert_eq(len(h["edges"]), 9)
-    assert_eq(len(h["nodes"]), 6)
+    assert len(h["edges"]) == 9
+    assert len(h["nodes"]) == 6
 
 
 def test_hyperedges_direct_manual_shaping():
@@ -202,14 +201,14 @@
         direct=True,
         EDGES={"aa": ["cc"], "cc": ["cc"]},
     )
-    assert_eq(len(h1["edges"]), 6)
+    assert len(h1["edges"]) == 6
 
     h2 = cugraph.hypergraph(
         hyper_df,
         direct=True,
         EDGES={"aa": ["cc", "bb", "aa"], "cc": ["cc"]},
     )
-    assert_eq(len(h2["edges"]), 12)
+    assert len(h2["edges"]) == 12
 
 
 @pytest.mark.parametrize("categorical_metadata", [False, True])
@@ -220,9 +219,8 @@ def test_drop_edge_attrs(categorical_metadata):
         drop_edge_attrs=True,
         categorical_metadata=categorical_metadata)
 
-    assert_eq(
-        len(h.keys()), len(["entities", "nodes", "edges", "events", "graph"])
-    )
+    assert len(h.keys()) == len(
+        ["entities", "nodes", "edges", "events", "graph"])
 
     edges = cudf.DataFrame.from_pandas(pd.DataFrame({
         "event_id": [
@@ -255,12 +253,12 @@
     if categorical_metadata:
         edges = edges.astype({"edge_type": "category"})
 
-    assert_eq(edges, h["edges"])
+    assert_frame_equal(edges, h["edges"], check_dtype=False)
 
     for (k, v) in [
         ("entities", 9), ("nodes", 12), ("edges", 9), ("events", 3)
     ]:
-        assert_eq(len(h[k]), v)
+        assert len(h[k]) == v
 
 
 @pytest.mark.parametrize("categorical_metadata", [False, True])
@@ -275,9 +273,8 @@
         categorical_metadata=categorical_metadata,
     )
 
-    assert_eq(
-        len(h.keys()), len(["entities", "nodes", "edges", "events", "graph"])
-    )
+    assert len(h.keys()) == len(
+        ["entities", "nodes", "edges", "events", "graph"])
 
     edges = cudf.DataFrame.from_pandas(pd.DataFrame({
         "event_id": [
@@ -298,10 +295,10 @@
     if categorical_metadata:
         edges = edges.astype({"edge_type":
"category"}) - assert_eq(edges, h["edges"]) + assert_frame_equal(edges, h["edges"], check_dtype=False) for (k, v) in [("entities", 9), ("nodes", 9), ("edges", 6), ("events", 0)]: - assert_eq(len(h[k]), v) + assert len(h[k]) == v def test_skip_hyper(): @@ -397,10 +394,10 @@ def test_skip_na_hyperedge(): nans_df, drop_edge_attrs=True )["edges"] - assert_eq(len(skip_attr_h_edges), len(expected_hits)) + assert len(skip_attr_h_edges) == len(expected_hits) default_h_edges = cugraph.hypergraph(nans_df)["edges"] - assert_eq(len(default_h_edges), len(expected_hits)) + assert len(default_h_edges) == len(expected_hits) def test_hyper_to_pa_vanilla(): diff --git a/python/cugraph/tests/test_jaccard.py b/python/cugraph/tests/test_jaccard.py index 3c3f6224d83..cc2795cb464 100644 --- a/python/cugraph/tests/test_jaccard.py +++ b/python/cugraph/tests/test_jaccard.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,11 +13,12 @@ import gc import time - import pytest +import cudf import cugraph from cugraph.tests import utils +from pathlib import PurePath # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -113,7 +114,9 @@ def test_jaccard(graph_file): assert err == 0 -@pytest.mark.parametrize("graph_file", ["../datasets/netscience.csv"]) +@pytest.mark.parametrize("graph_file", [PurePath( + utils.RAPIDS_DATASET_ROOT_DIR)/"netscience.csv"] +) def test_jaccard_edgevals(graph_file): gc.collect() @@ -220,3 +223,32 @@ def test_jaccard_nx(graph_file): # FIXME: Nx does a full all-pair Jaccard. # cuGraph does a limited 1-hop Jaccard # assert nx_j == cg_j + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +def test_jaccard_multi_column(graph_file): + gc.collect() + + M = utils.read_csv_for_nx(graph_file) + + cu_M = cudf.DataFrame() + cu_M["src_0"] = cudf.Series(M["0"]) + cu_M["dst_0"] = cudf.Series(M["1"]) + cu_M["src_1"] = cu_M["src_0"] + 1000 + cu_M["dst_1"] = cu_M["dst_0"] + 1000 + G1 = cugraph.Graph() + G1.from_cudf_edgelist(cu_M, source=["src_0", "src_1"], + destination=["dst_0", "dst_1"]) + + vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = vertex_pair[:5] + + df_res = cugraph.jaccard(G1, vertex_pair) + + G2 = cugraph.Graph() + G2.from_cudf_edgelist(cu_M, source="src_0", + destination="dst_0") + df_exp = cugraph.jaccard(G2, vertex_pair[["src_0", "dst_0"]]) + + # Calculating mismatch + assert df_res["jaccard_coeff"].equals(df_exp["jaccard_coeff"]) diff --git a/python/cugraph/tests/test_k_core.py b/python/cugraph/tests/test_k_core.py index 5e3220dcfb1..d09b719ab79 100644 --- a/python/cugraph/tests/test_k_core.py +++ b/python/cugraph/tests/test_k_core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -57,7 +57,6 @@ def calc_k_cores(graph_file, directed=True): def compare_edges(cg, nxg): edgelist_df = cg.view_edge_list() src, dest = edgelist_df["src"], edgelist_df["dst"] - assert cg.edgelist.weights is False assert len(src) == nxg.size() for i in range(len(src)): @@ -66,7 +65,7 @@ def compare_edges(cg, nxg): @pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) -def test_core_number_Graph(graph_file): +def test_k_core_Graph(graph_file): gc.collect() cu_kcore, nx_kcore = calc_k_cores(graph_file, False) @@ -75,7 +74,7 @@ def test_core_number_Graph(graph_file): @pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) -def test_core_number_Graph_nx(graph_file): +def test_k_core_Graph_nx(graph_file): gc.collect() NM = utils.read_csv_for_nx(graph_file) @@ -86,3 +85,35 @@ def test_core_number_Graph_nx(graph_file): cc = cugraph.k_core(Gnx) assert nx.is_isomorphic(nc, cc) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +def test_k_core_corenumber_multicolumn(graph_file): + gc.collect() + + cu_M = utils.read_csv_file(graph_file) + cu_M.rename(columns={'0': 'src_0', '1': 'dst_0'}, inplace=True) + cu_M['src_1'] = cu_M['src_0'] + 1000 + cu_M['dst_1'] = cu_M['dst_0'] + 1000 + + G1 = cugraph.Graph() + G1.from_cudf_edgelist(cu_M, source=["src_0", "src_1"], + destination=["dst_0", "dst_1"]) + + corenumber_G1 = cugraph.core_number(G1) + corenumber_G1.rename(columns={'core_number': 'values'}, inplace=True) + corenumber_G1 = corenumber_G1[['0_vertex', '1_vertex', 'values']] + + ck_res = cugraph.k_core(G1, core_number=corenumber_G1) + G2 = cugraph.Graph() + G2.from_cudf_edgelist(cu_M, source="src_0", + destination="dst_0") + ck_exp = cugraph.k_core(G2) + + # FIXME: Replace with multi-column view_edge_list() + edgelist_df = ck_res.edgelist.edgelist_df + edgelist_df_res = ck_res.unrenumber(edgelist_df, "src") + edgelist_df_res = ck_res.unrenumber(edgelist_df_res, "dst") + for i in range(len(edgelist_df_res)): + assert ck_exp.has_edge(edgelist_df_res["0_src"].iloc[i], + edgelist_df_res["0_dst"].iloc[i]) diff --git a/python/cugraph/tests/test_k_truss_subgraph.py b/python/cugraph/tests/test_k_truss_subgraph.py index e9ccac81cf6..1a1f5c66693 100644 --- a/python/cugraph/tests/test_k_truss_subgraph.py +++ b/python/cugraph/tests/test_k_truss_subgraph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -19,6 +19,7 @@ from cugraph.tests import utils import numpy as np +from numba import cuda # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -41,7 +42,11 @@ # currently in networkx master and will hopefully will make it to a release # soon. 
def ktruss_ground_truth(graph_file):
-    G = nx.read_edgelist(graph_file, nodetype=int, data=(("weights", float),))
+    G = nx.read_edgelist(
+        str(graph_file),
+        nodetype=int,
+        data=(("weights", float),)
+    )
     df = nx.to_pandas_edgelist(G)
     return df
@@ -69,6 +74,31 @@ def compare_k_truss(k_truss_cugraph, k, ground_truth_file):
     return True
 
 
+__cuda_version = cuda.runtime.get_version()
+__unsupported_cuda_version = (11, 4)
+
+
+# FIXME: remove when ktruss is supported on CUDA 11.4
+def test_unsupported_cuda_version():
+    """
+    Ensures the proper exception is raised when ktruss is called in an
+    unsupported env, and not when called in a supported env.
+    """
+    k = 5
+    cu_M = utils.read_csv_file(utils.DATASETS_KTRUSS[0][0])
+    G = cugraph.Graph()
+    G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2")
+
+    if __cuda_version == __unsupported_cuda_version:
+        with pytest.raises(NotImplementedError):
+            cugraph.k_truss(G, k)
+    else:
+        cugraph.k_truss(G, k)
+
+
+@pytest.mark.skipif((__cuda_version == __unsupported_cuda_version),
+                    reason="skipping on unsupported CUDA "
+                    f"{__unsupported_cuda_version} environment.")
 @pytest.mark.parametrize("graph_file, nx_ground_truth", utils.DATASETS_KTRUSS)
 def test_ktruss_subgraph_Graph(graph_file, nx_ground_truth):
     gc.collect()
@@ -82,6 +112,9 @@ def test_ktruss_subgraph_Graph(graph_file, nx_ground_truth):
     compare_k_truss(k_subgraph, k, nx_ground_truth)
 
 
+@pytest.mark.skipif((__cuda_version == __unsupported_cuda_version),
+                    reason="skipping on unsupported CUDA "
+                    f"{__unsupported_cuda_version} environment.")
 @pytest.mark.parametrize("graph_file, nx_ground_truth", utils.DATASETS_KTRUSS)
 def test_ktruss_subgraph_Graph_nx(graph_file, nx_ground_truth):
     gc.collect()
@@ -93,9 +126,7 @@ def test_ktruss_subgraph_Graph_nx(graph_file, nx_ground_truth):
     k = 5
     M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)
     G = nx.from_pandas_edgelist(
         M, source="0", target="1", create_using=nx.Graph()
     )
     k_subgraph = cugraph.k_truss(G, k)
-    df = nx.to_pandas_edgelist(k_subgraph)
     k_truss_nx = nx.k_truss(G, k)
-    nx_df = nx.to_pandas_edgelist(k_truss_nx)
-    assert len(df) == len(nx_df)
+    assert nx.is_isomorphic(k_subgraph, k_truss_nx)
diff --git a/python/cugraph/tests/test_katz_centrality.py b/python/cugraph/tests/test_katz_centrality.py
index a2a03c1518b..ef2f45c08a4 100644
--- a/python/cugraph/tests/test_katz_centrality.py
+++ b/python/cugraph/tests/test_katz_centrality.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at @@ -15,6 +16,7 @@ import pytest +import cudf import cugraph from cugraph.tests import utils @@ -112,3 +114,38 @@ def test_katz_centrality_nx(graph_file): err = err + 1 print("Mismatches:", err) assert err < (0.1 * len(ck)) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +def test_katz_centrality_multi_column(graph_file): + gc.collect() + + cu_M = utils.read_csv_file(graph_file) + cu_M.rename(columns={'0': 'src_0', '1': 'dst_0'}, inplace=True) + cu_M['src_1'] = cu_M['src_0'] + 1000 + cu_M['dst_1'] = cu_M['dst_0'] + 1000 + + G1 = cugraph.DiGraph() + G1.from_cudf_edgelist(cu_M, source=["src_0", "src_1"], + destination=["dst_0", "dst_1"]) + + G2 = cugraph.DiGraph() + G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") + + k_df_exp = cugraph.katz_centrality(G2, alpha=None, max_iter=1000) + k_df_exp = k_df_exp.sort_values("vertex").reset_index(drop=True) + + nstart = cudf.DataFrame() + nstart['vertex_0'] = k_df_exp['vertex'] + nstart['vertex_1'] = nstart['vertex_0'] + 1000 + nstart['values'] = k_df_exp['katz_centrality'] + + k_df_res = cugraph.katz_centrality(G1, nstart=nstart, + alpha=None, max_iter=1000) + k_df_res = k_df_res.sort_values("0_vertex").reset_index(drop=True) + k_df_res.rename(columns={'0_vertex': 'vertex'}, inplace=True) + + top_res = topKVertices(k_df_res, "katz_centrality", 10) + top_exp = topKVertices(k_df_exp, "katz_centrality", 10) + + assert top_res.equals(top_exp) diff --git a/python/cugraph/tests/test_leiden.py b/python/cugraph/tests/test_leiden.py index d6a7f86b5c5..b6c23dad6f2 100644 --- a/python/cugraph/tests/test_leiden.py +++ b/python/cugraph/tests/test_leiden.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -81,18 +81,13 @@ def test_leiden_nx(graph_file): NM = utils.read_csv_for_nx(graph_file) if edgevals: - G = nx.from_pandas_edgelist(NM, - create_using=nx.Graph(), - source="0", - target="1" - ) + G = nx.from_pandas_edgelist( + NM, create_using=nx.Graph(), source="0", target="1" + ) else: - G = nx.from_pandas_edgelist(NM, - create_using=nx.Graph(), - source="0", - target="1", - edge_attr="2" - ) + G = nx.from_pandas_edgelist( + NM, create_using=nx.Graph(), source="0", target="1", edge_attr="2" + ) leiden_parts, leiden_mod = cugraph_leiden(G, edgevals=True) louvain_parts, louvain_mod = cugraph_louvain(G, edgevals=True) diff --git a/python/cugraph/tests/test_louvain.py b/python/cugraph/tests/test_louvain.py index d6b0030eb73..fc112b8d657 100644 --- a/python/cugraph/tests/test_louvain.py +++ b/python/cugraph/tests/test_louvain.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -78,11 +78,12 @@ def test_louvain_with_edgevals(graph_file): nx_parts = networkx_call(M) # Calculating modularity scores for comparison Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, source="0", target="1", + edge_attr="weight", create_using=nx.Graph() ) cu_parts = cu_parts.to_pandas() - cu_map = dict(zip(cu_parts['vertex'], cu_parts['partition'])) + cu_map = dict(zip(cu_parts["vertex"], cu_parts["partition"])) assert set(nx_parts.keys()) == set(cu_map.keys()) @@ -105,11 +106,12 @@ def test_louvain(graph_file): # Calculating modularity scores for comparison Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, source="0", target="1", + edge_attr="weight", create_using=nx.Graph() ) cu_parts = cu_parts.to_pandas() - cu_map = dict(zip(cu_parts['vertex'], cu_parts['partition'])) + cu_map = dict(zip(cu_parts["vertex"], cu_parts["partition"])) assert set(nx_parts.keys()) == set(cu_map.keys()) diff --git a/python/cugraph/tests/test_maximum_spanning_tree.py b/python/cugraph/tests/test_maximum_spanning_tree.py index e20e2f72267..311f28bd6f8 100644 --- a/python/cugraph/tests/test_maximum_spanning_tree.py +++ b/python/cugraph/tests/test_maximum_spanning_tree.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,16 +11,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +import time import gc import pytest +import numpy as np +import rmm +import cudf import cugraph from cugraph.tests import utils -import rmm -import cudf -import time -import numpy as np + # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from diff --git a/python/cugraph/tests/test_minimum_spanning_tree.py b/python/cugraph/tests/test_minimum_spanning_tree.py index 55ebdcfda08..d1588507bce 100644 --- a/python/cugraph/tests/test_minimum_spanning_tree.py +++ b/python/cugraph/tests/test_minimum_spanning_tree.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,16 +11,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +import time import gc import pytest +import numpy as np +import rmm +import cudf import cugraph from cugraph.tests import utils -import rmm -import cudf -import time -import numpy as np + # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from diff --git a/python/cugraph/tests/test_modularity.py b/python/cugraph/tests/test_modularity.py index 7a7d42d1592..21b8adae6e6 100644 --- a/python/cugraph/tests/test_modularity.py +++ b/python/cugraph/tests/test_modularity.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
@@ -71,6 +71,43 @@ def test_modularity_clustering(graph_file, partitions):
     assert cu_score > rand_score
 
 
+# Test modularity clustering using multi-column vertices
+@pytest.mark.parametrize("graph_file", utils.DATASETS)
+@pytest.mark.parametrize("partitions", PARTITIONS)
+def test_modularity_clustering_multi_column(graph_file, partitions):
+    gc.collect()
+
+    # Read in the graph and get a cugraph object
+    cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False)
+    cu_M.rename(columns={'0': 'src_0', '1': 'dst_0'}, inplace=True)
+    cu_M['src_1'] = cu_M['src_0'] + 1000
+    cu_M['dst_1'] = cu_M['dst_0'] + 1000
+
+    G1 = cugraph.Graph()
+    G1.from_cudf_edgelist(cu_M, source=["src_0", "src_1"],
+                          destination=["dst_0", "dst_1"],
+                          edge_attr="2")
+
+    df1 = cugraph.spectralModularityMaximizationClustering(
+        G1, partitions, num_eigen_vects=(partitions - 1)
+    )
+
+    cu_score = cugraph.analyzeClustering_modularity(G1, partitions, df1,
+                                                    ['0_vertex',
+                                                     '1_vertex'],
+                                                    'cluster')
+
+    G2 = cugraph.Graph()
+    G2.from_cudf_edgelist(cu_M, source="src_0",
+                          destination="dst_0",
+                          edge_attr="2")
+
+    rand_score = random_call(G2, partitions)
+    # Assert that the partitioning has better modularity than the random
+    # assignment
+    assert cu_score > rand_score
+
+
 # Test to ensure DiGraph objs are not accepted
 # Test all combinations of default/managed and pooled/non-pooled allocation
diff --git a/python/cugraph/tests/test_multigraph.py b/python/cugraph/tests/test_multigraph.py
new file mode 100644
index 00000000000..57be3eb34e8
--- /dev/null
+++ b/python/cugraph/tests/test_multigraph.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import gc + +import pytest +import networkx as nx +import numpy as np + +import cugraph +from cugraph.tests import utils + + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= +def setup_function(): + gc.collect() + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +def test_multigraph(graph_file): + # FIXME: Migrate to new test fixtures for Graph setup once available + cuM = utils.read_csv_file(graph_file) + G = cugraph.MultiDiGraph() + G.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2") + + nxM = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + Gnx = nx.from_pandas_edgelist( + nxM, + source="0", + target="1", + edge_attr="weight", + create_using=nx.MultiDiGraph(), + ) + + assert G.number_of_edges() == Gnx.number_of_edges() + assert G.number_of_nodes() == Gnx.number_of_nodes() + cuedges = cugraph.to_pandas_edgelist(G) + cuedges.rename(columns={"src": "source", "dst": "target", + "weights": "weight"}, inplace=True) + cuedges["weight"] = cuedges["weight"].round(decimals=3) + nxedges = nx.to_pandas_edgelist(Gnx).astype(dtype={"source": "int32", + "target": "int32", + "weight": "float32"}) + cuedges = cuedges.sort_values(by=["source", "target"]).\ + reset_index(drop=True) + nxedges = nxedges.sort_values(by=["source", "target"]).\ + reset_index(drop=True) + nxedges["weight"] = nxedges["weight"].round(decimals=3) + assert nxedges.equals(cuedges[["source", "target", "weight"]]) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +def test_Graph_from_MultiGraph(graph_file): + # FIXME: Migrate to new test fixtures for Graph setup once available + cuM = utils.read_csv_file(graph_file) + GM = cugraph.MultiGraph() + GM.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2") + nxM = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + GnxM = nx.from_pandas_edgelist( + nxM, + source="0", + target="1", + edge_attr="weight", + create_using=nx.MultiGraph(), + ) + + G = cugraph.Graph(GM) + Gnx = nx.Graph(GnxM) + assert Gnx.number_of_edges() == G.number_of_edges() + + GdM = cugraph.MultiDiGraph() + GdM.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2") + GnxdM = nx.from_pandas_edgelist( + nxM, + source="0", + target="1", + edge_attr="weight", + create_using=nx.MultiGraph(), + ) + Gd = cugraph.DiGraph(GdM) + Gnxd = nx.DiGraph(GnxdM) + assert Gnxd.number_of_edges() == Gd.number_of_edges() + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +def test_multigraph_sssp(graph_file): + # FIXME: Migrate to new test fixtures for Graph setup once available + cuM = utils.read_csv_file(graph_file) + G = cugraph.MultiDiGraph() + G.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2") + cu_paths = cugraph.sssp(G, 0) + max_val = np.finfo(cu_paths["distance"].dtype).max + cu_paths = cu_paths[cu_paths["distance"] != max_val] + nxM = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + Gnx = nx.from_pandas_edgelist( + nxM, + source="0", + target="1", + edge_attr="weight", + create_using=nx.MultiDiGraph(), + ) + nx_paths = nx.single_source_dijkstra_path_length(Gnx, 0) + + cu_dist = cu_paths.sort_values(by='vertex')['distance'].to_array() + nx_dist = [i[1] for i in sorted(nx_paths.items())] + + assert (cu_dist == nx_dist).all() diff --git a/python/cugraph/tests/test_nx_convert.py b/python/cugraph/tests/test_nx_convert.py index 
08a96a801e2..98cc8a11dc7 100644 --- a/python/cugraph/tests/test_nx_convert.py +++ b/python/cugraph/tests/test_nx_convert.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,11 +12,14 @@ # limitations under the License. import gc + import pytest import cudf + import cugraph from cugraph.tests import utils + # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from # 'collections.abc' is deprecated, and in 3.8 it will stop working) for @@ -77,7 +80,6 @@ def test_networkx_compatibility(graph_file): _compare_graphs(nxG, cuG) -# Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_nx_convert(graph_file): gc.collect() diff --git a/python/cugraph/tests/test_overlap.py b/python/cugraph/tests/test_overlap.py index 53d279478f7..42bc3ea9808 100644 --- a/python/cugraph/tests/test_overlap.py +++ b/python/cugraph/tests/test_overlap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -17,6 +17,8 @@ import pytest import numpy as np import scipy + +import cudf import cugraph from cugraph.tests import utils @@ -147,3 +149,32 @@ def test_overlap_edge_vals(graph_file): else: diff = abs(cpu_coeff[i] - cu_coeff[i]) assert diff < 1.0e-6 + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +def test_overlap_multi_column(graph_file): + gc.collect() + + M = utils.read_csv_for_nx(graph_file) + + cu_M = cudf.DataFrame() + cu_M["src_0"] = cudf.Series(M["0"]) + cu_M["dst_0"] = cudf.Series(M["1"]) + cu_M["src_1"] = cu_M["src_0"] + 1000 + cu_M["dst_1"] = cu_M["dst_0"] + 1000 + G1 = cugraph.Graph() + G1.from_cudf_edgelist(cu_M, source=["src_0", "src_1"], + destination=["dst_0", "dst_1"]) + + vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = vertex_pair[:5] + + df_res = cugraph.overlap(G1, vertex_pair) + + G2 = cugraph.Graph() + G2.from_cudf_edgelist(cu_M, source="src_0", + destination="dst_0") + df_exp = cugraph.overlap(G2, vertex_pair[["src_0", "dst_0"]]) + + # Calculating mismatch + assert df_res["overlap_coeff"].equals(df_exp["overlap_coeff"]) diff --git a/python/cugraph/tests/test_pagerank.py b/python/cugraph/tests/test_pagerank.py index 1ab370041b5..50be1cd5230 100644 --- a/python/cugraph/tests/test_pagerank.py +++ b/python/cugraph/tests/test_pagerank.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -21,6 +21,7 @@ import cugraph from cugraph.tests import utils + # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from # 'collections.abc' is deprecated, and in 3.8 it will stop working) for @@ -143,7 +144,7 @@ def networkx_call(Gnx, max_iter, tol, alpha, personalization_perc, nnz_vtx): # # https://github.com/rapidsai/cugraph/issues/533 # -# @pytest.mark.parametrize("graph_file", utils.DATASETS) + @pytest.mark.parametrize("graph_file", utils.DATASETS) @pytest.mark.parametrize("max_iter", MAX_ITERATIONS) @pytest.mark.parametrize("tol", TOLERANCE) @@ -159,7 +160,8 @@ def test_pagerank( M = utils.read_csv_for_nx(graph_file) nnz_vtx = np.unique(M[['0', '1']]) Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", create_using=nx.DiGraph() + M, source="0", target="1", edge_attr="weight", + create_using=nx.DiGraph() ) networkx_pr, networkx_prsn = networkx_call( @@ -169,13 +171,13 @@ def test_pagerank( cu_nstart = None if has_guess == 1: cu_nstart = cudify(networkx_pr) - max_iter = 5 + max_iter = 20 cu_prsn = cudify(networkx_prsn) # cuGraph PageRank cu_M = utils.read_csv_file(graph_file) G = cugraph.DiGraph() - G.from_cudf_edgelist(cu_M, source="0", destination="1") + G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") cugraph_pr = cugraph_call(G, max_iter, tol, alpha, cu_prsn, cu_nstart) @@ -218,7 +220,7 @@ def test_pagerank_nx( cu_nstart = None if has_guess == 1: cu_nstart = cudify(networkx_pr) - max_iter = 5 + max_iter = 20 cu_prsn = cudify(networkx_prsn) # cuGraph PageRank with Nx Graph @@ -239,3 +241,88 @@ def test_pagerank_nx( print(f"{cugraph_pr[i][1]} and {cugraph_pr[i][1]}") print("Mismatches:", err) assert err < (0.01 * len(cugraph_pr)) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("max_iter", MAX_ITERATIONS) +@pytest.mark.parametrize("tol", TOLERANCE) +@pytest.mark.parametrize("alpha", ALPHA) +@pytest.mark.parametrize("personalization_perc", PERSONALIZATION_PERC) +@pytest.mark.parametrize("has_guess", HAS_GUESS) +def test_pagerank_multi_column( + graph_file, max_iter, tol, alpha, personalization_perc, has_guess +): + gc.collect() + + # NetworkX PageRank + M = utils.read_csv_for_nx(graph_file) + nnz_vtx = np.unique(M[['0', '1']]) + + Gnx = nx.from_pandas_edgelist( + M, source="0", target="1", edge_attr="weight", + create_using=nx.DiGraph() + ) + + networkx_pr, networkx_prsn = networkx_call( + Gnx, max_iter, tol, alpha, personalization_perc, nnz_vtx + ) + + cu_nstart = None + if has_guess == 1: + cu_nstart_temp = cudify(networkx_pr) + max_iter = 100 + cu_nstart = cudf.DataFrame() + cu_nstart["vertex_0"] = cu_nstart_temp["vertex"] + cu_nstart["vertex_1"] = cu_nstart["vertex_0"] + 1000 + cu_nstart["values"] = cu_nstart_temp["values"] + + cu_prsn_temp = cudify(networkx_prsn) + if cu_prsn_temp is not None: + cu_prsn = cudf.DataFrame() + cu_prsn["vertex_0"] = cu_prsn_temp["vertex"] + cu_prsn["vertex_1"] = cu_prsn["vertex_0"] + 1000 + cu_prsn["values"] = cu_prsn_temp["values"] + else: + cu_prsn = cu_prsn_temp + + cu_M = cudf.DataFrame() + cu_M["src_0"] = cudf.Series(M["0"]) + cu_M["dst_0"] = cudf.Series(M["1"]) + cu_M["src_1"] = cu_M["src_0"] + 1000 + cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M["weights"] = cudf.Series(M["weight"]) + + cu_G = cugraph.DiGraph() + cu_G.from_cudf_edgelist(cu_M, source=["src_0", "src_1"], + destination=["dst_0", "dst_1"], + edge_attr="weights") + + df = cugraph.pagerank( + 
cu_G,
+        alpha=alpha,
+        max_iter=max_iter,
+        tol=tol,
+        personalization=cu_prsn,
+        nstart=cu_nstart,
+    )
+
+    cugraph_pr = []
+
+    df = df.sort_values("0_vertex").reset_index(drop=True)
+
+    pr_scores = df["pagerank"].to_array()
+    for i, rank in enumerate(pr_scores):
+        cugraph_pr.append((i, rank))
+
+    # Calculating mismatch
+    networkx_pr = sorted(networkx_pr.items(), key=lambda x: x[0])
+    err = 0
+    assert len(cugraph_pr) == len(networkx_pr)
+    for i in range(len(cugraph_pr)):
+        if (
+            abs(cugraph_pr[i][1] - networkx_pr[i][1]) > tol * 1.1
+            and cugraph_pr[i][0] == networkx_pr[i][0]
+        ):
+            err = err + 1
+    print("Mismatches:", err)
+    assert err < (0.01 * len(cugraph_pr))
diff --git a/python/cugraph/tests/test_paths.py b/python/cugraph/tests/test_paths.py
index 7467d024051..56cc9b3cd50 100644
--- a/python/cugraph/tests/test_paths.py
+++ b/python/cugraph/tests/test_paths.py
@@ -1,11 +1,27 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from tempfile import NamedTemporaryFile
+
 import cudf
-import cugraph
 from cupy.sparse import coo_matrix as cupy_coo_matrix
 import cupy
 import networkx as nx
 import pytest
-import sys
-from tempfile import NamedTemporaryFile
+
+import cugraph
+
 
 CONNECTED_GRAPH = """1,5,3
 1,4,1
diff --git a/python/cugraph/tests/test_random_walks.py b/python/cugraph/tests/test_random_walks.py
new file mode 100644
index 00000000000..302a93cd02a
--- /dev/null
+++ b/python/cugraph/tests/test_random_walks.py
@@ -0,0 +1,221 @@
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+
+import pytest
+
+from cugraph.tests import utils
+import cugraph
+import random
+
+
+# =============================================================================
+# Parameters
+# =============================================================================
+DIRECTED_GRAPH_OPTIONS = [False, True]
+WEIGHTED_GRAPH_OPTIONS = [False, True]
+DATASETS = [pytest.param(d) for d in utils.DATASETS]
+DATASETS_SMALL = [pytest.param(d) for d in utils.DATASETS_SMALL]
+
+
+def calc_random_walks(graph_file,
+                      directed=False,
+                      max_depth=None,
+                      use_padding=False):
+    """
+    compute random walks for each node in 'start_vertices'
+
+    parameters
+    ----------
+    G : cuGraph.Graph or networkx.Graph
+        The graph can be either directed (DiGraph) or undirected (Graph).
+        Weights in the graph are ignored.
+ Use weight parameter if weights need to be considered + (currently not supported) + + start_vertices : int or list or cudf.Series + A single node or a list or a cudf.Series of nodes from which to run + the random walks + + max_depth : int + The maximum depth of the random walks + + use_padding : bool + If True, padded paths are returned else coalesced paths are returned. + + Returns + ------- + vertex_paths : cudf.Series or cudf.DataFrame + Series containing the vertices of edges/paths in the random walk. + + edge_weight_paths: cudf.Series + Series containing the edge weights of edges represented by the + returned vertex_paths + + sizes: int + The path size in case of coalesced paths. + """ + G = utils.generate_cugraph_graph_from_file( + graph_file, directed=directed, edgevals=True) + assert G is not None + + k = random.randint(1, 10) + start_vertices = random.sample(range(G.number_of_vertices()), k) + vertex_paths, edge_weights, vertex_path_sizes = cugraph.random_walks( + G, start_vertices, max_depth, use_padding) + + return (vertex_paths, edge_weights, vertex_path_sizes), start_vertices + + +def check_random_walks(path_data, seeds, df_G=None): + invalid_edge = 0 + invalid_seeds = 0 + offsets_idx = 0 + next_path_idx = 0 + v_paths = path_data[0] + sizes = path_data[2].to_array().tolist() + + for s in sizes: + for i in range(next_path_idx, next_path_idx+s-1): + src, dst = v_paths.iloc[i], v_paths.iloc[i+1] + if i == next_path_idx and src != seeds[offsets_idx]: + invalid_seeds += 1 + print( + "[ERR] Invalid seed: " + " src {} != src {}" + .format(src, seeds[offsets_idx]) + ) + offsets_idx += 1 + next_path_idx += s + + exp_edge = df_G.loc[ + (df_G['src'] == (src)) & ( + df_G['dst'] == (dst))].reset_index(drop=True) + + if not (exp_edge['src'].loc[0], exp_edge['dst'].loc[0]) == (src, dst): + print( + "[ERR] Invalid edge: " + "There is no edge src {} dst {}" + .format(src, dst) + ) + invalid_edge += 1 + + assert invalid_edge == 0 + assert invalid_seeds == 0 + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= + + +def prepare_test(): + gc.collect() + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +@pytest.mark.parametrize("max_depth", [None]) +def test_random_walks_invalid_max_dept(graph_file, + directed, + max_depth): + prepare_test() + with pytest.raises(TypeError): + df, offsets, seeds = calc_random_walks( + graph_file, + directed=directed, + max_depth=max_depth + ) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +def test_random_walks_coalesced( + graph_file, + directed +): + max_depth = random.randint(2, 10) + df_G = utils.read_csv_file(graph_file) + df_G.rename( + columns={"0": "src", "1": "dst", "2": "weight"}, inplace=True) + path_data, seeds = calc_random_walks( + graph_file, + directed, + max_depth=max_depth + ) + check_random_walks(path_data, seeds, df_G) + + # Check path query output + df = cugraph.rw_path(len(seeds), path_data[2]) + v_offsets = [0] + path_data[2].cumsum()[:-1].to_array().tolist() + w_offsets = [0] + (path_data[2]-1).cumsum()[:-1].to_array().tolist() + + assert df['weight_sizes'].equals(path_data[2]-1) + assert df['vertex_offsets'].to_array().tolist() == v_offsets + assert df['weight_offsets'].to_array().tolist() == w_offsets + + 
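The offsets arithmetic exercised by the coalesced test above is easy to misread, so here is a minimal sketch of slicing the coalesced output into per-seed walks. It assumes only the cugraph.random_walks behavior documented in calc_random_walks above (vertex paths, edge weights, and per-seed path sizes when use_padding is False); the toy edge list, column names, and seed values are illustrative assumptions, not part of this patch.

import cudf
import cugraph

# Toy weighted graph; the data here is hypothetical, mirroring the
# src/dst/weight column convention used by the tests in this file.
edges = cudf.DataFrame({"src": [0, 1, 2, 0],
                        "dst": [1, 2, 0, 2],
                        "weight": [1.0, 2.0, 3.0, 4.0]})
G = cugraph.Graph()
G.from_cudf_edgelist(edges, source="src", destination="dst",
                     edge_attr="weight")

# use_padding=False (the default) returns coalesced paths plus sizes.
seeds = [0, 2]
v_paths, e_weights, sizes = cugraph.random_walks(G, seeds, max_depth=4)

# Walk i occupies v_paths[offsets[i] : offsets[i] + sizes[i]], where
# offsets is the exclusive prefix sum of sizes -- the same arithmetic
# used by check_random_walks and the rw_path checks above.
offsets = [0] + sizes.cumsum()[:-1].to_array().tolist()
for i, seed in enumerate(seeds):
    walk = v_paths.iloc[offsets[i]:offsets[i] + int(sizes.iloc[i])]
    print("seed", seed, "->", walk.to_array().tolist())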
+@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +def test_random_walks_padded( + graph_file, + directed +): + max_depth = random.randint(2, 10) + df_G = utils.read_csv_file(graph_file) + df_G.rename( + columns={"0": "src", "1": "dst", "2": "weight"}, inplace=True) + path_data, seeds = calc_random_walks( + graph_file, + directed, + max_depth=max_depth, + use_padding=True + ) + v_paths = path_data[0] + e_weights = path_data[1] + assert len(v_paths) == max_depth*len(seeds) + assert len(e_weights) == (max_depth - 1)*len(seeds) + + +"""@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +def test_random_walks( + graph_file, + directed +): + max_depth = random.randint(2, 10) + df_G = utils.read_csv_file(graph_file) + df_G.rename( + columns={"0": "src", "1": "dst", "2": "weight"}, inplace=True) + df_G['src_0'] = df_G['src'] + 1000 + df_G['dst_0'] = df_G['dst'] + 1000 + + if directed: + G = cugraph.DiGraph() + else: + G = cugraph.Graph() + G.from_cudf_edgelist(df_G, source=['src', 'src_0'], + destination=['dst', 'dst_0'], + edge_attr="weight") + + k = random.randint(1, 10) + start_vertices = random.sample(G.nodes().to_array().tolist(), k) + + seeds = cudf.DataFrame() + seeds['v'] = start_vertices + seeds['v_0'] = seeds['v'] + 1000 + + df, offsets = cugraph.random_walks(G, seeds, max_depth) + + check_random_walks(df, offsets, seeds, df_G) +""" diff --git a/python/cugraph/tests/test_renumber.py b/python/cugraph/tests/test_renumber.py index 91416942429..129bd667621 100644 --- a/python/cugraph/tests/test_renumber.py +++ b/python/cugraph/tests/test_renumber.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
@@ -17,8 +17,8 @@
 import pandas as pd
 import pytest
-
 import cudf
+
 from cugraph.structure.number_map import NumberMap
 from cugraph.tests import utils
@@ -44,13 +44,14 @@ def test_renumber_ips():
     gdf["source_as_int"] = gdf["source_list"].str.ip2int()
     gdf["dest_as_int"] = gdf["dest_list"].str.ip2int()
 
-    numbering = NumberMap()
-    numbering.from_series(gdf["source_as_int"], gdf["dest_as_int"])
-    src = numbering.to_internal_vertex_id(gdf["source_as_int"])
-    dst = numbering.to_internal_vertex_id(gdf["dest_as_int"])
+    renumbered_gdf, renumber_map = NumberMap.renumber(gdf,
+                                                      "source_as_int",
+                                                      "dest_as_int")
 
-    check_src = numbering.from_internal_vertex_id(src)["0"]
-    check_dst = numbering.from_internal_vertex_id(dst)["0"]
+    check_src = renumber_map.from_internal_vertex_id(renumbered_gdf['src']
+                                                     )["0"]
+    check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst']
+                                                     )["0"]
 
     assert check_src.equals(gdf["source_as_int"])
     assert check_dst.equals(gdf["dest_as_int"])
@@ -78,13 +79,14 @@ def test_renumber_ips_cols():
     gdf["source_as_int"] = gdf["source_list"].str.ip2int()
     gdf["dest_as_int"] = gdf["dest_list"].str.ip2int()
 
-    numbering = NumberMap()
-    numbering.from_dataframe(gdf, ["source_as_int"], ["dest_as_int"])
-    src = numbering.to_internal_vertex_id(gdf["source_as_int"])
-    dst = numbering.to_internal_vertex_id(gdf["dest_as_int"])
+    renumbered_gdf, renumber_map = NumberMap.renumber(gdf,
+                                                      ["source_as_int"],
+                                                      ["dest_as_int"])
 
-    check_src = numbering.from_internal_vertex_id(src)["0"]
-    check_dst = numbering.from_internal_vertex_id(dst)["0"]
+    check_src = renumber_map.from_internal_vertex_id(renumbered_gdf['src']
+                                                     )["0"]
+    check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst']
+                                                     )["0"]
 
     assert check_src.equals(gdf["source_as_int"])
     assert check_dst.equals(gdf["dest_as_int"])
@@ -110,13 +112,14 @@ def test_renumber_ips_str_cols():
 
     gdf = cudf.from_pandas(pdf)
 
-    numbering = NumberMap()
-    numbering.from_dataframe(gdf, ["source_list"], ["dest_list"])
-    src = numbering.to_internal_vertex_id(gdf["source_list"])
-    dst = numbering.to_internal_vertex_id(gdf["dest_list"])
+    renumbered_gdf, renumber_map = NumberMap.renumber(gdf,
+                                                      ["source_list"],
+                                                      ["dest_list"])
 
-    check_src = numbering.from_internal_vertex_id(src)["0"]
-    check_dst = numbering.from_internal_vertex_id(dst)["0"]
+    check_src = renumber_map.from_internal_vertex_id(renumbered_gdf['src']
+                                                     )["0"]
+    check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst']
+                                                     )["0"]
 
     assert check_src.equals(gdf["source_list"])
     assert check_dst.equals(gdf["dest_list"])
@@ -130,13 +133,14 @@ def test_renumber_negative():
 
     gdf = cudf.DataFrame.from_pandas(df[["source_list", "dest_list"]])
 
-    numbering = NumberMap()
-    numbering.from_dataframe(gdf, ["source_list"], ["dest_list"])
-    src = numbering.to_internal_vertex_id(gdf["source_list"])
-    dst = numbering.to_internal_vertex_id(gdf["dest_list"])
+    renumbered_gdf, renumber_map = NumberMap.renumber(gdf,
+                                                      "source_list",
+                                                      "dest_list")
 
-    check_src = numbering.from_internal_vertex_id(src)["0"]
-    check_dst = numbering.from_internal_vertex_id(dst)["0"]
+    check_src = renumber_map.from_internal_vertex_id(renumbered_gdf['src']
+                                                     )["0"]
+    check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst']
+                                                     )["0"]
 
     assert check_src.equals(gdf["source_list"])
     assert check_dst.equals(gdf["dest_list"])
@@ -150,19 +154,54 @@ def test_renumber_negative_col():
 
     gdf = cudf.DataFrame.from_pandas(df[["source_list", "dest_list"]])
 
-    numbering = NumberMap()
-
numbering.from_dataframe(gdf, ["source_list"], ["dest_list"]) - src = numbering.to_internal_vertex_id(gdf["source_list"]) - dst = numbering.to_internal_vertex_id(gdf["dest_list"]) + renumbered_gdf, renumber_map = NumberMap.renumber(gdf, + "source_list", + "dest_list") - check_src = numbering.from_internal_vertex_id(src)["0"] - check_dst = numbering.from_internal_vertex_id(dst)["0"] + check_src = renumber_map.from_internal_vertex_id(renumbered_gdf['src'] + )["0"] + check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst'] + )["0"] assert check_src.equals(gdf["source_list"]) assert check_dst.equals(gdf["dest_list"]) -# Test all combinations of default/managed and pooled/non-pooled allocation +@pytest.mark.skip(reason="dropped renumbering from series support") +@pytest.mark.parametrize("graph_file", utils.DATASETS) +def test_renumber_series(graph_file): + gc.collect() + + M = utils.read_csv_for_nx(graph_file) + sources = cudf.Series(M["0"]) + destinations = cudf.Series(M["1"]) + + translate = 1000 + + df = cudf.DataFrame() + df["src"] = cudf.Series([x + translate for x in sources. + values_host]) + df["dst"] = cudf.Series([x + translate for x in destinations. + values_host]) + + numbering_series_1 = NumberMap() + numbering_series_1.from_series(df["src"]) + + numbering_series_2 = NumberMap() + numbering_series_2.from_series(df["dst"]) + + renumbered_src = numbering_series_1.add_internal_vertex_id( + df["src"], "src_id") + renumbered_dst = numbering_series_2.add_internal_vertex_id( + df["dst"], "dst_id") + + check_src = numbering_series_1.from_internal_vertex_id(renumbered_src, + "src_id") + check_dst = numbering_series_2.from_internal_vertex_id(renumbered_dst, + "dst_id") + + assert check_src["0_y"].equals(check_src["0_x"]) + assert check_dst["0_y"].equals(check_dst["0_x"]) @pytest.mark.parametrize("graph_file", utils.DATASETS) @@ -181,22 +220,23 @@ def test_renumber_files(graph_file): df["dst"] = cudf.Series([x + translate for x in destinations. values_host]) - numbering = NumberMap() - numbering.from_series(df["src"], df["dst"]) + exp_src = cudf.Series([x + translate for x in sources. + values_host]) + exp_dst = cudf.Series([x + translate for x in destinations. + values_host]) - renumbered_df = numbering.add_internal_vertex_id( - numbering.add_internal_vertex_id(df, "src_id", ["src"]), - "dst_id", ["dst"] - ) + renumbered_df, renumber_map = NumberMap.renumber(df, "src", "dst", + preserve_order=True) - check_src = numbering.from_internal_vertex_id(renumbered_df, "src_id") - check_dst = numbering.from_internal_vertex_id(renumbered_df, "dst_id") + unrenumbered_df = renumber_map.unrenumber(renumbered_df, "src", + preserve_order=True) + unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst", + preserve_order=True) - assert check_src["src"].equals(check_src["0"]) - assert check_dst["dst"].equals(check_dst["0"]) + assert exp_src.equals(unrenumbered_df["src"]) + assert exp_dst.equals(unrenumbered_df["dst"]) -# Test all combinations of default/managed and pooled/non-pooled allocation @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_renumber_files_col(graph_file): gc.collect() @@ -212,22 +252,23 @@ def test_renumber_files_col(graph_file): gdf['dst'] = cudf.Series([x + translate for x in destinations. values_host]) - numbering = NumberMap() - numbering.from_dataframe(gdf, ["src"], ["dst"]) + exp_src = cudf.Series([x + translate for x in sources. + values_host]) + exp_dst = cudf.Series([x + translate for x in destinations. 
+ values_host]) - renumbered_df = numbering.add_internal_vertex_id( - numbering.add_internal_vertex_id(gdf, "src_id", ["src"]), - "dst_id", ["dst"] - ) + renumbered_df, renumber_map = NumberMap.renumber(gdf, ["src"], ["dst"], + preserve_order=True) - check_src = numbering.from_internal_vertex_id(renumbered_df, "src_id") - check_dst = numbering.from_internal_vertex_id(renumbered_df, "dst_id") + unrenumbered_df = renumber_map.unrenumber(renumbered_df, "src", + preserve_order=True) + unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst", + preserve_order=True) - assert check_src["src"].equals(check_src["0"]) - assert check_dst["dst"].equals(check_dst["0"]) + assert exp_src.equals(unrenumbered_df["src"]) + assert exp_dst.equals(unrenumbered_df["dst"]) -# Test all combinations of default/managed and pooled/non-pooled allocation @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_renumber_files_multi_col(graph_file): gc.collect() @@ -244,21 +285,17 @@ def test_renumber_files_multi_col(graph_file): gdf["src"] = sources + translate gdf["dst"] = destinations + translate - numbering = NumberMap() - numbering.from_dataframe(gdf, ["src", "src_old"], ["dst", "dst_old"]) - - renumbered_df = numbering.add_internal_vertex_id( - numbering.add_internal_vertex_id( - gdf, "src_id", ["src", "src_old"] - ), - "dst_id", - ["dst", "dst_old"], - ) + renumbered_df, renumber_map = NumberMap.renumber(gdf, + ["src", "src_old"], + ["dst", "dst_old"], + preserve_order=True) - check_src = numbering.from_internal_vertex_id(renumbered_df, "src_id") - check_dst = numbering.from_internal_vertex_id(renumbered_df, "dst_id") + unrenumbered_df = renumber_map.unrenumber(renumbered_df, "src", + preserve_order=True) + unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst", + preserve_order=True) - assert check_src["src"].equals(check_src["0"]) - assert check_src["src_old"].equals(check_src["1"]) - assert check_dst["dst"].equals(check_dst["0"]) - assert check_dst["dst_old"].equals(check_dst["1"]) + assert gdf["src"].equals(unrenumbered_df["0_src"]) + assert gdf["src_old"].equals(unrenumbered_df["1_src"]) + assert gdf["dst"].equals(unrenumbered_df["0_dst"]) + assert gdf["dst_old"].equals(unrenumbered_df["1_dst"]) diff --git a/python/cugraph/tests/test_sssp.py b/python/cugraph/tests/test_sssp.py index 0a5347a6290..9230b7a7b96 100644 --- a/python/cugraph/tests/test_sssp.py +++ b/python/cugraph/tests/test_sssp.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -29,6 +29,7 @@ import cugraph from cugraph.tests import utils + # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from # 'collections.abc' is deprecated, and in 3.8 it will stop working) for diff --git a/python/cugraph/tests/test_subgraph_extraction.py b/python/cugraph/tests/test_subgraph_extraction.py index a4f36af994a..389a7716e48 100644 --- a/python/cugraph/tests/test_subgraph_extraction.py +++ b/python/cugraph/tests/test_subgraph_extraction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -20,6 +20,7 @@ import cugraph from cugraph.tests import utils + # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from # 'collections.abc' is deprecated, and in 3.8 it will stop working) for @@ -70,7 +71,6 @@ def nx_call(M, verts, directed=True): return nx.subgraph(G, verts) -# Test all combinations of default/managed and pooled/non-pooled allocation @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_subgraph_extraction_DiGraph(graph_file): gc.collect() @@ -85,9 +85,6 @@ def test_subgraph_extraction_DiGraph(graph_file): assert compare_edges(cu_sg, nx_sg) -# Test all combinations of default/managed and pooled/non-pooled allocation - - @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_subgraph_extraction_Graph(graph_file): gc.collect() @@ -123,10 +120,45 @@ def test_subgraph_extraction_Graph_nx(graph_file): ) nx_sub = nx.subgraph(G, verts) - nx_df = nx.to_pandas_edgelist(nx_sub).to_dict() cu_verts = cudf.Series(verts) cu_sub = cugraph.subgraph(G, cu_verts) - cu_df = nx.to_pandas_edgelist(cu_sub).to_dict() - assert nx_df == cu_df + for (u, v) in cu_sub.edges(): + assert nx_sub.has_edge(u, v) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +def test_subgraph_extraction_multi_column(graph_file): + gc.collect() + + M = utils.read_csv_for_nx(graph_file) + + cu_M = cudf.DataFrame() + cu_M["src_0"] = cudf.Series(M["0"]) + cu_M["dst_0"] = cudf.Series(M["1"]) + cu_M["src_1"] = cu_M["src_0"] + 1000 + cu_M["dst_1"] = cu_M["dst_0"] + 1000 + G1 = cugraph.Graph() + G1.from_cudf_edgelist(cu_M, source=["src_0", "src_1"], + destination=["dst_0", "dst_1"]) + + verts = cudf.Series([0, 1, 17]) + verts_G1 = cudf.DataFrame() + verts_G1['v_0'] = verts + verts_G1['v_1'] = verts + 1000 + + sG1 = cugraph.subgraph(G1, verts_G1) + + G2 = cugraph.Graph() + G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") + + sG2 = cugraph.subgraph(G2, verts) + + # FIXME: Replace with multi-column view_edge_list() + edgelist_df = sG1.edgelist.edgelist_df + edgelist_df_res = sG1.unrenumber(edgelist_df, "src") + edgelist_df_res = sG1.unrenumber(edgelist_df_res, "dst") + for i in range(len(edgelist_df_res)): + assert sG2.has_edge(edgelist_df_res["0_src"].iloc[i], + edgelist_df_res["0_dst"].iloc[i]) diff --git a/python/cugraph/tests/test_symmetrize.py b/python/cugraph/tests/test_symmetrize.py index 7ef8b33e97f..4080362ddfa 100644 --- a/python/cugraph/tests/test_symmetrize.py +++ b/python/cugraph/tests/test_symmetrize.py @@ -19,10 +19,9 @@ import cudf import cugraph from cugraph.tests import utils -import cugraph.comms as Comms -from dask.distributed import Client -from dask_cuda import LocalCUDACluster -from cugraph.dask.common.mg_utils import is_single_gpu +from cugraph.dask.common.mg_utils import (is_single_gpu, + setup_local_dask_cluster, + teardown_local_dask_cluster) def test_version(): @@ -188,17 +187,11 @@ def test_symmetrize_weighted(graph_file): compare(cu_M["0"], cu_M["1"], cu_M["2"], sym_src, sym_dst, sym_w) -@pytest.fixture +@pytest.fixture(scope="module") def client_connection(): - cluster = LocalCUDACluster() - client = Client(cluster) - Comms.initialize(p2p=True) - + (cluster, client) = setup_local_dask_cluster(p2p=True) yield client - - Comms.destroy() - client.close() - cluster.close() + teardown_local_dask_cluster(cluster, client) @pytest.mark.skipif( diff --git a/python/cugraph/tests/test_traveling_salesperson.py 
b/python/cugraph/tests/test_traveling_salesperson.py new file mode 100644 index 00000000000..d43b55c43d0 --- /dev/null +++ b/python/cugraph/tests/test_traveling_salesperson.py @@ -0,0 +1,81 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cugraph.tests import utils +import cudf +import cugraph +import gc +import numpy as np +import pytest + +# Temporarily suppress warnings till networkX fixes deprecation warnings +# (Using or importing the ABCs from 'collections' instead of from +# 'collections.abc' is deprecated, and in 3.8 it will stop working) for +# python 3.7. Also, this import networkx needs to be relocated in the +# third-party group once this gets fixed. +import warnings + +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning) + import networkx as nx + +print("Networkx version : {} ".format(nx.__version__)) + + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= +def setup_function(): + gc.collect() + + +# ============================================================================= +# Helper functions +# ============================================================================= +def load_tsp(filename=None): + gdf = cudf.read_csv(filename, + delim_whitespace=True, + skiprows=6, + names=["vertex", "x", "y"], + dtypes={"vertex": "int32", + "x": "float32", + "y": "float32"} + ) + gdf = gdf.dropna() + gdf['vertex'] = gdf['vertex'].str.strip() + gdf['vertex'] = gdf['vertex'].astype("int32") + return gdf + + +# ============================================================================= +# Tests +# ============================================================================= +@pytest.mark.parametrize("tsplib_file, ref_cost", utils.DATASETS_TSPLIB) +def test_traveling_salesperson(gpubenchmark, tsplib_file, ref_cost): + pos_list = load_tsp(tsplib_file) + + cu_route, cu_cost = gpubenchmark(cugraph.traveling_salesperson, + pos_list, + restarts=4096) + + print("Cugraph cost: ", cu_cost) + print("Ref cost: ", ref_cost) + error = np.abs(cu_cost - ref_cost) / ref_cost + print("Approximation error is: {:.2f}%".format(error * 100)) + # Check we are within 5% of TSPLIB + assert(error * 100 < 5.) + assert(cu_route.nunique() == pos_list.shape[0]) + assert(cu_route.shape[0] == pos_list.shape[0]) + min_val = pos_list["vertex"].min() + max_val = pos_list["vertex"].max() + assert(cu_route.clip(min_val, max_val).shape[0] == cu_route.shape[0]) diff --git a/python/cugraph/tests/test_triangle_count.py b/python/cugraph/tests/test_triangle_count.py index ff28f55838d..917a4f320a7 100644 --- a/python/cugraph/tests/test_triangle_count.py +++ b/python/cugraph/tests/test_triangle_count.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. 
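`test_traveling_salesperson` above drives the new solver against TSPLIB instances with known reference costs. A self-contained sketch of the same entry point on a toy instance (the coordinates below are illustrative and not from any TSPLIB file; `restarts` is kept small only for the example):

```python
import cudf
import cugraph

# Eight cities on the border of a 2x2 square.
pos_list = cudf.DataFrame({
    "vertex": cudf.Series(list(range(8)), dtype="int32"),
    "x": cudf.Series([0., 1., 2., 2., 2., 1., 0., 0.], dtype="float32"),
    "y": cudf.Series([0., 0., 0., 1., 2., 2., 2., 1.], dtype="float32"),
})

route, cost = cugraph.traveling_salesperson(pos_list, restarts=64)

assert route.nunique() == len(pos_list)  # each city appears exactly once
```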
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -19,6 +19,7 @@ import cugraph from cugraph.tests import utils + # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from # 'collections.abc' is deprecated, and in 3.8 it will stop working) for diff --git a/python/cugraph/tests/test_utils.py b/python/cugraph/tests/test_utils.py index 22af649ea2e..175cf389d16 100644 --- a/python/cugraph/tests/test_utils.py +++ b/python/cugraph/tests/test_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,17 +12,21 @@ # limitations under the License. import gc +from pathlib import PurePath + import pytest import cugraph +import cudf from cugraph.tests import utils +import numpy as np def test_bfs_paths(): with pytest.raises(ValueError) as ErrorMsg: gc.collect() - graph_file = '../datasets/karate.csv' + graph_file = PurePath(utils.RAPIDS_DATASET_ROOT_DIR)/"karate.csv" cu_M = utils.read_csv_file(graph_file) @@ -47,7 +51,7 @@ def test_bfs_paths_array(): with pytest.raises(ValueError) as ErrorMsg: gc.collect() - graph_file = '../datasets/karate.csv' + graph_file = PurePath(utils.RAPIDS_DATASET_ROOT_DIR)/"karate.csv" cu_M = utils.read_csv_file(graph_file) @@ -66,3 +70,30 @@ def test_bfs_paths_array(): answer = cugraph.utils.get_traversed_path_list(df, 100) assert "not in the result set" in str(ErrorMsg) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.skip(reason="Skipping large tests") +def test_get_traversed_cost(graph_file): + cu_M = utils.read_csv_file(graph_file) + + noise = cudf.Series(np.random.randint(10, size=(cu_M.shape[0]))) + cu_M['info'] = cu_M['2'] + noise + + G = cugraph.Graph() + G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='info') + + # run SSSP starting at vertex 17 + df = cugraph.sssp(G, 16) + + answer = cugraph.utilities.path_retrieval.get_traversed_cost(df, 16, + cu_M['0'], + cu_M['1'], + cu_M['info'] + ) + + df = df.sort_values(by='vertex').reset_index() + answer = answer.sort_values(by='vertex').reset_index() + + assert df.shape[0] == answer.shape[0] + assert np.allclose(df['distance'], answer['info']) diff --git a/python/cugraph/tests/test_wjaccard.py b/python/cugraph/tests/test_wjaccard.py index c5cab18484c..f3b3fb9efd6 100644 --- a/python/cugraph/tests/test_wjaccard.py +++ b/python/cugraph/tests/test_wjaccard.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -21,6 +21,7 @@ import cugraph from cugraph.tests import utils + # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from # 'collections.abc' is deprecated, and in 3.8 it will stop working) for @@ -37,16 +38,19 @@ def cugraph_call(cu_M): # Device data - weights_arr = cudf.Series( + weight_arr = cudf.Series( np.ones(max(cu_M["0"].max(), cu_M["1"].max()) + 1, dtype=np.float32) ) + weights = cudf.DataFrame() + weights['vertex'] = np.arange(len(weight_arr), dtype=np.int32) + weights['weight'] = weight_arr G = cugraph.Graph() G.from_cudf_edgelist(cu_M, source="0", destination="1") # cugraph Jaccard Call t1 = time.time() - df = cugraph.jaccard_w(G, weights_arr) + df = cugraph.jaccard_w(G, weights) t2 = time.time() - t1 print("Time : " + str(t2)) @@ -99,3 +103,56 @@ def test_wjaccard(graph_file): for i in range(len(cu_coeff)): diff = abs(nx_coeff[i] - cu_coeff[i]) assert diff < 1.0e-6 + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +def test_wjaccard_multi_column_weights(graph_file): + gc.collect() + + M = utils.read_csv_for_nx(graph_file) + cu_M = utils.read_csv_file(graph_file) + # suppress F841 (local variable is assigned but never used) in flake8 + # no networkX equivalent to compare cu_coeff against... + cu_coeff = cugraph_call(cu_M) # noqa: F841 + nx_coeff = networkx_call(M) + for i in range(len(cu_coeff)): + diff = abs(nx_coeff[i] - cu_coeff[i]) + assert diff < 1.0e-6 + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +def test_wjaccard_multi_column(graph_file): + gc.collect() + + M = utils.read_csv_for_nx(graph_file) + + cu_M = cudf.DataFrame() + cu_M["src_0"] = cudf.Series(M["0"]) + cu_M["dst_0"] = cudf.Series(M["1"]) + cu_M["src_1"] = cu_M["src_0"] + 1000 + cu_M["dst_1"] = cu_M["dst_0"] + 1000 + G1 = cugraph.Graph() + G1.from_cudf_edgelist(cu_M, source=["src_0", "src_1"], + destination=["dst_0", "dst_1"]) + + G2 = cugraph.Graph() + G2.from_cudf_edgelist(cu_M, source="src_0", + destination="dst_0") + + vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = vertex_pair[:5] + + weight_arr = cudf.Series(np.ones(G2.number_of_vertices(), + dtype=np.float32)) + weights = cudf.DataFrame() + weights['vertex'] = G2.nodes() + weights['vertex_'] = weights['vertex'] + 1000 + weights['weight'] = weight_arr + + df_res = cugraph.jaccard_w(G1, weights, vertex_pair) + + weights = weights[['vertex', 'weight']] + df_exp = cugraph.jaccard_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) + + # Calculating mismatch + assert df_res["jaccard_coeff"].equals(df_exp["jaccard_coeff"]) diff --git a/python/cugraph/tests/test_woverlap.py b/python/cugraph/tests/test_woverlap.py index e7da21014ba..66032ac3f48 100644 --- a/python/cugraph/tests/test_woverlap.py +++ b/python/cugraph/tests/test_woverlap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
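The change above moves `jaccard_w` (and, below, `overlap_w`) from a positional weight Series to a DataFrame keyed by vertex ID. A sketch of the new call shape, assuming `cu_M` is an edge list read as in these tests:

```python
import numpy as np
import cudf
import cugraph

G = cugraph.Graph()
G.from_cudf_edgelist(cu_M, source="0", destination="1")

# Weights are now looked up by the 'vertex' column, not by position.
weights = cudf.DataFrame()
weights["vertex"] = G.nodes()
weights["weight"] = np.ones(G.number_of_vertices(), dtype=np.float32)

df = cugraph.jaccard_w(G, weights)
```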
# You may obtain a copy of the License at @@ -16,10 +16,11 @@ import pytest import scipy +import numpy as np import cudf + import cugraph from cugraph.tests import utils -import numpy as np def cugraph_call(cu_M, pairs): @@ -27,13 +28,16 @@ def cugraph_call(cu_M, pairs): weights_arr = cudf.Series( np.ones(max(cu_M["0"].max(), cu_M["1"].max()) + 1, dtype=np.float32) ) + weights = cudf.DataFrame() + weights['vertex'] = np.arange(len(weights_arr), dtype=np.int32) + weights['weight'] = weights_arr G = cugraph.DiGraph() G.from_cudf_edgelist(cu_M, source="0", destination="1") # cugraph Overlap Call t1 = time.time() - df = cugraph.overlap_w(G, weights_arr, pairs) + df = cugraph.overlap_w(G, weights, pairs) t2 = time.time() - t1 print("Time : " + str(t2)) df = df.sort_values(by=["source", "destination"]) @@ -83,7 +87,6 @@ def cpu_call(M, first, second): return result -# Test @pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) def test_woverlap(graph_file): gc.collect() @@ -114,3 +117,42 @@ def test_woverlap(graph_file): else: diff = abs(cpu_coeff[i] - cu_coeff[i]) assert diff < 1.0e-6 + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +def test_woverlap_multi_column(graph_file): + gc.collect() + + M = utils.read_csv_for_nx(graph_file) + + cu_M = cudf.DataFrame() + cu_M["src_0"] = cudf.Series(M["0"]) + cu_M["dst_0"] = cudf.Series(M["1"]) + cu_M["src_1"] = cu_M["src_0"] + 1000 + cu_M["dst_1"] = cu_M["dst_0"] + 1000 + G1 = cugraph.Graph() + G1.from_cudf_edgelist(cu_M, source=["src_0", "src_1"], + destination=["dst_0", "dst_1"]) + + G2 = cugraph.Graph() + G2.from_cudf_edgelist(cu_M, source="src_0", + destination="dst_0") + + vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = vertex_pair[:5] + + weight_arr = cudf.Series(np.ones(G2.number_of_vertices(), + dtype=np.float32)) + + weights = cudf.DataFrame() + weights['vertex'] = G2.nodes() + weights['vertex_'] = weights['vertex'] + 1000 + weights['weight'] = weight_arr + + df_res = cugraph.overlap_w(G1, weights, vertex_pair) + + weights = weights[['vertex', 'weight']] + df_exp = cugraph.overlap_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) + + # Calculating mismatch + assert df_res["overlap_coeff"].equals(df_exp["overlap_coeff"]) diff --git a/python/cugraph/tests/utils.py b/python/cugraph/tests/utils.py old mode 100644 new mode 100755 index 164c6efb084..56e90b1f6bb --- a/python/cugraph/tests/utils.py +++ b/python/cugraph/tests/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -26,7 +26,7 @@ from scipy.sparse.coo import coo_matrix as sp_coo_matrix from scipy.sparse.csr import csr_matrix as sp_csr_matrix from scipy.sparse.csc import csc_matrix as sp_csc_matrix - +from pathlib import PurePath import cudf import dask_cudf @@ -40,37 +40,74 @@ # # Datasets # -DATASETS_UNDIRECTED = ["../datasets/karate.csv", "../datasets/dolphins.csv"] + + +RAPIDS_DATASET_ROOT_DIR = os.getenv("RAPIDS_DATASET_ROOT_DIR", "../datasets") + +DATASETS_UNDIRECTED = [PurePath(RAPIDS_DATASET_ROOT_DIR)/f for + f in ["karate.csv", "dolphins.csv"]] DATASETS_UNDIRECTED_WEIGHTS = [ - "../datasets/netscience.csv", + PurePath(RAPIDS_DATASET_ROOT_DIR)/"netscience.csv" ] -DATASETS_UNRENUMBERED = ["../datasets/karate-disjoint.csv"] +DATASETS_UNRENUMBERED = [PurePath( + RAPIDS_DATASET_ROOT_DIR)/"karate-disjoint.csv" +] + +DATASETS = [PurePath(RAPIDS_DATASET_ROOT_DIR)/f for f in [ + "karate-disjoint.csv", + "dolphins.csv", + "netscience.csv"] +] + +DATASETS_MULTI_EDGES = [PurePath(RAPIDS_DATASET_ROOT_DIR)/f for f in [ + "karate_multi_edge.csv", + "dolphins_multi_edge.csv"] +] -DATASETS = [ - "../datasets/karate-disjoint.csv", - "../datasets/dolphins.csv", - "../datasets/netscience.csv", +DATASETS_STR_ISLT_V = [PurePath(RAPIDS_DATASET_ROOT_DIR)/f for f in [ + "karate_mod.mtx", + "karate_str.mtx"] ] + +DATASETS_SELF_LOOPS = [PurePath(RAPIDS_DATASET_ROOT_DIR)/f for f in [ + "karate_s_loop.csv", + "dolphins_s_loop.csv"] +] + + # '../datasets/email-Eu-core.csv'] STRONGDATASETS = [ - "../datasets/dolphins.csv", - "../datasets/netscience.csv", - "../datasets/email-Eu-core.csv", + PurePath(RAPIDS_DATASET_ROOT_DIR)/f for f in [ + "dolphins.csv", + "netscience.csv", + "email-Eu-core.csv"] ] -DATASETS_KTRUSS = [ - ("../datasets/polbooks.csv", "../datasets/ref/ktruss/polbooks.csv") + +DATASETS_KTRUSS = [( + PurePath(RAPIDS_DATASET_ROOT_DIR)/"polbooks.csv", + PurePath(RAPIDS_DATASET_ROOT_DIR)/"ref/ktruss/polbooks.csv") +] + +DATASETS_TSPLIB = [ + (PurePath(RAPIDS_DATASET_ROOT_DIR)/f,) + (d,) for (f, d) in [ + ("gil262.tsp", 2378), + ("eil51.tsp", 426), + ("kroA100.tsp", 21282), + ("tsp225.tsp", 3916)] ] DATASETS_SMALL = [ - "../datasets/karate.csv", - "../datasets/dolphins.csv", - "../datasets/polbooks.csv", + PurePath(RAPIDS_DATASET_ROOT_DIR)/f for f in [ + "karate.csv", + "dolphins.csv", + "polbooks.csv"] ] + MATRIX_INPUT_TYPES = [ pytest.param( cp_coo_matrix, marks=pytest.mark.matrix_types, id="CuPy.coo_matrix" diff --git a/python/cugraph/traversal/__init__.py b/python/cugraph/traversal/__init__.py index 58e37a7add0..e74266d29fc 100644 --- a/python/cugraph/traversal/__init__.py +++ b/python/cugraph/traversal/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -17,5 +17,8 @@ sssp, shortest_path, filter_unreachable, - shortest_path_length -) \ No newline at end of file + shortest_path_length, +) +from cugraph.traversal.traveling_salesperson import traveling_salesperson + +from cugraph.traversal.ms_bfs import concurrent_bfs, multi_source_bfs diff --git a/python/cugraph/traversal/bfs.pxd b/python/cugraph/traversal/bfs.pxd index 5b73d23045c..ac825deffa6 100644 --- a/python/cugraph/traversal/bfs.pxd +++ b/python/cugraph/traversal/bfs.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. 
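With the hard-coded `../datasets` strings gone, the dataset location is resolved once from the environment, so the tests no longer depend on the working directory. The resolution reduces to:

```python
import os
from pathlib import PurePath

RAPIDS_DATASET_ROOT_DIR = os.getenv("RAPIDS_DATASET_ROOT_DIR", "../datasets")
karate = PurePath(RAPIDS_DATASET_ROOT_DIR) / "karate.csv"
```

A typical invocation (path hypothetical) would then be `RAPIDS_DATASET_ROOT_DIR=/data/cugraph/datasets pytest python/cugraph/tests`.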
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -16,17 +16,19 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * from libcpp cimport bool +cdef extern from "limits.h": + cdef int INT_MAX -cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": cdef void call_bfs[vertex_t, weight_t]( const handle_t &handle, const graph_container_t &g, vertex_t *identifiers, vertex_t *distances, vertex_t *predecessors, - double *sp_counters, + vertex_t depth_limit, const vertex_t start_vertex, - bool directed) except + + bool direction_optimizing) except + diff --git a/python/cugraph/traversal/bfs.py b/python/cugraph/traversal/bfs.py index efbae095676..d397b5a4241 100644 --- a/python/cugraph/traversal/bfs.py +++ b/python/cugraph/traversal/bfs.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -14,7 +14,7 @@ import cudf from cugraph.traversal import bfs_wrapper -from cugraph.structure.graph import Graph, DiGraph +from cugraph.structure.graph_classes import Graph, DiGraph from cugraph.utilities import (ensure_cugraph_obj, is_matrix_type, is_cp_matrix_type, @@ -41,7 +41,7 @@ import_from="scipy.sparse.csc") -def _ensure_args(G, start, return_sp_counter, i_start, directed): +def _ensure_args(G, start, i_start, directed): """ Ensures the args passed in are usable for the API api_name and returns the args with proper defaults if not specified, or raises TypeError or @@ -52,9 +52,6 @@ def _ensure_args(G, start, return_sp_counter, i_start, directed): raise TypeError("cannot specify both 'start' and 'i_start'") if (start is None) and (i_start is None): raise TypeError("must specify 'start' or 'i_start', but not both") - if (return_sp_counter is not None) and \ - (return_sp_counter not in [True, False]): - raise ValueError("'return_sp_counter' must be a bool") G_type = type(G) # Check for Graph-type inputs @@ -67,10 +64,8 @@ def _ensure_args(G, start, return_sp_counter, i_start, directed): start = start if start is not None else i_start if directed is None: directed = True - if return_sp_counter is None: - return_sp_counter = False - return (start, return_sp_counter, directed) + return (start, directed) def _convert_df_to_output_type(df, input_type): @@ -92,30 +87,23 @@ def _convert_df_to_output_type(df, input_type): if is_cp_matrix_type(input_type): distances = cp.fromDlpack(sorted_df["distance"].to_dlpack()) preds = cp.fromDlpack(sorted_df["predecessor"].to_dlpack()) - if "sp_counter" in df.columns: - return (distances, preds, - cp.fromDlpack(sorted_df["sp_counter"].to_dlpack())) - else: - return (distances, preds) + return (distances, preds) else: distances = sorted_df["distance"].to_array() preds = sorted_df["predecessor"].to_array() - if "sp_counter" in df.columns: - return (distances, preds, - sorted_df["sp_counter"].to_array()) - else: - return (distances, preds) + return (distances, preds) else: raise TypeError(f"input type {input_type} is not a supported type.") def bfs(G, start=None, - 
return_sp_counter=None, + depth_limit=None, i_start=None, directed=None, return_predecessors=None): - """Find the distances and predecessors for a breadth first traversal of a + """ + Find the distances and predecessors for a breadth first traversal of a graph. Parameters @@ -128,16 +116,18 @@ def bfs(G, start : Integer The index of the graph vertex from which the traversal begins - return_sp_counter : bool, optional, default=False - Indicates if shortest path counters should be returned - i_start : Integer, optional Identical to start, added for API compatibility. Only start or i_start can be set, not both. + depth_limit : Integer or None + Limit the depth of the search + directed : bool, optional - NOTE: For non-Graph-type (eg. sparse matrix) values of G only. Raises - TypeError if used with a Graph object. + NOTE + For non-Graph-type (eg. sparse matrix) values of G only. Raises + TypeError if used with a Graph object. + If True (default), then convert the input matrix to a cugraph.DiGraph, otherwise a cugraph.Graph object will be used. @@ -154,10 +144,6 @@ def bfs(G, df['predecessor'] for each i'th position in the column, the vertex ID immediately preceding the vertex at position i in the 'vertex' column - df['sp_counter'] for each i'th position in the column, the number of - shortest paths leading to the vertex at position i in the 'vertex' - column (Only if retrun_sp_counter is True) - If G is a networkx.Graph, returns: pandas.DataFrame with contents equivalent to the cudf.DataFrame @@ -189,34 +175,30 @@ def bfs(G, >>> df = cugraph.bfs(G, 0) """ - (start, return_sp_counter, directed) = \ - _ensure_args(G, start, return_sp_counter, i_start, directed) + (start, directed) = \ + _ensure_args(G, start, i_start, directed) # FIXME: allow nx_weight_attr to be specified (G, input_type) = ensure_cugraph_obj( G, nx_weight_attr="weight", matrix_graph_type=DiGraph if directed else Graph) - if type(G) is Graph: - is_directed = False - else: - is_directed = True - if G.renumbered is True: - start = G.lookup_internal_vertex_id(cudf.Series([start]))[0] - - df = bfs_wrapper.bfs(G, start, is_directed, return_sp_counter) + if isinstance(start, cudf.DataFrame): + start = G.lookup_internal_vertex_id(start, start.columns).iloc[0] + else: + start = G.lookup_internal_vertex_id(cudf.Series([start]))[0] + df = bfs_wrapper.bfs(G, start, depth_limit) if G.renumbered: df = G.unrenumber(df, "vertex") df = G.unrenumber(df, "predecessor") - df["predecessor"].fillna(-1, inplace=True) + df.fillna(-1, inplace=True) return _convert_df_to_output_type(df, input_type) -def bfs_edges(G, source, reverse=False, depth_limit=None, sort_neighbors=None, - return_sp_counter=False): +def bfs_edges(G, source, reverse=False, depth_limit=None, sort_neighbors=None): """ Find the distances and predecessors for a breadth first traversal of a graph. @@ -237,14 +219,10 @@ def bfs_edges(G, source, reverse=False, depth_limit=None, sort_neighbors=None, depth_limit : Int or None Limit the depth of the search - Currently not implemented sort_neighbors : None or Function Currently not implemented - return_sp_counter : bool, optional, default=False - Indicates if shortest path counters should be returned - Returns ------- Return value type is based on the input type. 
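As the signature change above shows, `return_sp_counter` is removed and `depth_limit` is now honored rather than raising `NotImplementedError`. A sketch of the reworked call, assuming `cu_M` is any of the test edge lists:

```python
import cugraph

G = cugraph.Graph()
G.from_cudf_edgelist(cu_M, source="0", destination="1")

# Traverse at most two hops from vertex 0; deeper vertices are not expanded.
df = cugraph.bfs(G, start=0, depth_limit=2)
```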
If G is a cugraph.Graph, @@ -258,10 +236,6 @@ def bfs_edges(G, source, reverse=False, depth_limit=None, sort_neighbors=None, df['predecessor'] for each i'th position in the column, the vertex ID immediately preceding the vertex at position i in the 'vertex' column - df['sp_counter'] for each i'th position in the column, the number of - shortest paths leading to the vertex at position i in the 'vertex' - column (Only if retrun_sp_counter is True) - If G is a networkx.Graph, returns: pandas.DataFrame with contents equivalent to the cudf.DataFrame @@ -298,9 +272,4 @@ def bfs_edges(G, source, reverse=False, depth_limit=None, sort_neighbors=None, "reverse processing of graph is currently not supported" ) - if depth_limit is not None: - raise NotImplementedError( - "depth limit implementation of BFS is not currently supported" - ) - - return bfs(G, source, return_sp_counter) + return bfs(G, source, depth_limit) diff --git a/python/cugraph/traversal/bfs_wrapper.pyx b/python/cugraph/traversal/bfs_wrapper.pyx index ae346aea953..1896415b1e3 100644 --- a/python/cugraph/traversal/bfs_wrapper.pyx +++ b/python/cugraph/traversal/bfs_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -17,64 +17,51 @@ # cython: language_level = 3 cimport cugraph.traversal.bfs as c_bfs -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * from cugraph.structure import graph_primtypes_wrapper from libcpp cimport bool from libc.stdint cimport uintptr_t -from libc.float cimport FLT_MAX_EXP - import cudf -import rmm import numpy as np -def bfs(input_graph, start, directed=True, - return_sp_counter=False): +def bfs(input_graph, start, depth_limit, direction_optimizing=False): """ Call bfs """ # Step 1: Declare the different varibales cdef graph_container_t graph_container - # FIXME: Offsets and indices are currently hardcoded to int, but this may - # not be acceptable in the future. 
+ numberTypeMap = {np.dtype("int32") : numberTypeEnum.int32Type, np.dtype("int64") : numberTypeEnum.int64Type, np.dtype("float32") : numberTypeEnum.floatType, np.dtype("double") : numberTypeEnum.doubleType} - # Pointers required for CSR Graph - cdef uintptr_t c_offsets_ptr = NULL # Pointer to the CSR offsets - cdef uintptr_t c_indices_ptr = NULL # Pointer to the CSR indices - cdef uintptr_t c_weights = NULL - cdef uintptr_t c_local_verts = NULL; - cdef uintptr_t c_local_edges = NULL; - cdef uintptr_t c_local_offsets = NULL; weight_t = np.dtype("float32") + [src, dst] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']], [np.int32]) + weights = None # Pointers for SSSP / BFS cdef uintptr_t c_identifier_ptr = NULL # Pointer to the DataFrame 'vertex' Series cdef uintptr_t c_distance_ptr = NULL # Pointer to the DataFrame 'distance' Series cdef uintptr_t c_predecessor_ptr = NULL # Pointer to the DataFrame 'predecessor' Series - cdef uintptr_t c_sp_counter_ptr = NULL # Pointer to the DataFrame 'sp_counter' Series + if depth_limit is None: + depth_limit = c_bfs.INT_MAX # Step 2: Verifiy input_graph has the expected format - if input_graph.adjlist is None: - input_graph.view_adj_list() cdef unique_ptr[handle_t] handle_ptr handle_ptr.reset(new handle_t()) handle_ = handle_ptr.get(); - # Step 3: Extract CSR offsets, indices, weights are not expected - # - offsets: int (signed, 32-bit) - # - indices: int (signed, 32-bit) - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) - c_offsets_ptr = offsets.__cuda_array_interface__['data'][0] - c_indices_ptr = indices.__cuda_array_interface__['data'][0] - - # Step 4: Setup number of vertices and edges + # Step 3: Setup number of vertices and edges num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) + # Step 4: Extract COO + cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] + cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] + cdef uintptr_t c_edge_weights = NULL + # Step 5: Check if source index is valid if not 0 <= start < num_verts: raise ValueError("Starting vertex should be between 0 to number of vertices") @@ -82,30 +69,33 @@ def bfs(input_graph, start, directed=True, # Step 6: Generate the cudf.DataFrame result # Current implementation expects int (signed 32-bit) for distance df = cudf.DataFrame() - df['vertex'] = cudf.Series(np.zeros(num_verts, dtype=np.int32)) + df['vertex'] = cudf.Series(np.arange(num_verts), dtype=np.int32) df['distance'] = cudf.Series(np.zeros(num_verts, dtype=np.int32)) df['predecessor'] = cudf.Series(np.zeros(num_verts, dtype=np.int32)) - if (return_sp_counter): - df['sp_counter'] = cudf.Series(np.zeros(num_verts, dtype=np.double)) # Step 7: Associate to cudf Series c_identifier_ptr = df['vertex'].__cuda_array_interface__['data'][0] c_distance_ptr = df['distance'].__cuda_array_interface__['data'][0] c_predecessor_ptr = df['predecessor'].__cuda_array_interface__['data'][0] - if return_sp_counter: - c_sp_counter_ptr = df['sp_counter'].__cuda_array_interface__['data'][0] + + is_symmetric = not input_graph.is_directed() # Step 8: Proceed to BFS - # FIXME: [int, int, float] or may add an explicit [int, int, int] in graph.cu? 
- populate_graph_container_legacy(graph_container, - ((graphTypeEnum.LegacyCSR)), - handle_[0], - c_offsets_ptr, c_indices_ptr, c_weights, - ((numberTypeEnum.int32Type)), - ((numberTypeEnum.int32Type)), - ((numberTypeMap[weight_t])), - num_verts, num_edges, - c_local_verts, c_local_edges, c_local_offsets) + populate_graph_container(graph_container, + handle_[0], + c_src_vertices, c_dst_vertices, c_edge_weights, + NULL, + NULL, + 0, + ((numberTypeEnum.int32Type)), + ((numberTypeEnum.int32Type)), + ((numberTypeMap[weight_t])), + num_edges, + num_verts, num_edges, + False, + is_symmetric, + False, + False) # Different pathing wether shortest_path_counting is required or not c_bfs.call_bfs[int, float](handle_ptr.get()[0], @@ -113,8 +103,8 @@ def bfs(input_graph, start, directed=True, c_identifier_ptr, c_distance_ptr, c_predecessor_ptr, - c_sp_counter_ptr, + depth_limit, start, - directed) + direction_optimizing) return df diff --git a/python/cugraph/traversal/ms_bfs.py b/python/cugraph/traversal/ms_bfs.py new file mode 100644 index 00000000000..3d158524751 --- /dev/null +++ b/python/cugraph/traversal/ms_bfs.py @@ -0,0 +1,280 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import cudf + +import warnings + + +def _get_feasibility(G, sources, components=None, depth_limit=None): + """ + Evaluate the feasibility for breadth first traversal from multiple sources + in a graph. + + Parameters + ---------- + G : cugraph.Graph or cugraph.DiGraph + The adjacency list will be computed if not already present. + + sources : cudf.Series + Subset of vertices from which the traversals start. A BFS is run for + each source in the Series. + The size of the series should be at least one and cannot exceed + the size of the graph. + + depth_limit : Integer, optional, default=None + Limit the depth of the search. Terminates if no more vertices are + reachable within the distance of depth_limit + + components : cudf.DataFrame, optional, default=None + GPU Dataframe containing the component information. + Passing this information may impact the return type. + When no component information is passed BFS uses one component + behavior settings. + + components['vertex'] : cudf.Series + vertex IDs + components['color'] : cudf.Series + component IDs/color for vertices. 
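In the dense-output case, the estimate computed by `_get_feasibility` below reduces to the following arithmetic (the graph sizes are illustrative; the 1.1 factor is the 10% allowance for context, handle, and temporary allocations mentioned in the code):

```python
# int32 vertices and edges, unweighted graph
size_of_v = size_of_e = 4
size_of_w = 0

V, E, n_sources = 1_000_000, 10_000_000, 8

G_sz = E * size_of_e + E * size_of_w + V * size_of_v
output_sz = n_sources * 2 * V * size_of_v  # distances + predecessors per source
mem_footprint = (G_sz + output_sz) * 1.1   # ~119 MB for these inputs
```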
+ + Returns + ------- + mem_footprint : integer + Estimated memory foot print size in Bytes + """ + + # Fixme not implemented in RMM yet + # using 96GB upper bound for now + # mem = get_device_memory_info() + mem = 9.6e10 + n_sources = sources.size + V = G.number_of_vertices() + E = G.number_of_edges() + mean_component_sz = V + n_components = 1 + + # Retreive types + size_of_v = 4 + size_of_e = 4 + size_of_w = 0 + if G.adjlist.weights is not None: + if G.adjlist.weights.dtype is np.float64: + size_of_w = 8 + else: + size_of_w = 4 + if G.adjlist.offsets.dtype is np.float64: + size_of_v = 8 + if G.adjlist.indices.dtype is np.float64: + size_of_e = 8 + + # Graph size + G_sz = E * size_of_e + E * size_of_w + V * size_of_v + + # The impact of depth limit depends on the sparsity + # pattern and diameter. We cannot leverage it without + # traversing the full dataset a the moment. + + # dense output + output_sz = n_sources * 2 * V * size_of_v + + # sparse output + if components is not None: + tmp = components["color"].value_counts() + n_components = tmp.size + if n_sources / n_components > 100: + warnings.warn( + "High number of seeds per component result in large output." + ) + mean_component_sz = tmp.mean() + output_sz = mean_component_sz * n_sources * 2 * size_of_e + + # counting 10% for context, handle and temporary allocations + mem_footprint = (G_sz + output_sz) * 1.1 + if mem_footprint > mem: + warnings.warn(f"Cannot execute in-memory :{mem_footprint} Bytes") + + return mem_footprint + + +def concurrent_bfs(Graphs, sources, depth_limit=None, offload=False): + """ + Find the breadth first traversals of multiple graphs with multiple sources + in each graph. + + Parameters + ---------- + Graphs : list of cugraph.Graph or cugraph.DiGraph + The adjacency lists will be computed if not already present. + + sources : list of cudf.Series + For each graph, subset of vertices from which the traversals start. + A BFS is run in Graphs[i] for each source in the Series at sources[i]. + The size of this list must match the size of the graph list. + The size of each Series (ie. the number of sources per graph) + is flexible, but cannot exceed the size of the corresponding graph. + + + depth_limit : Integer, optional, default=None + Limit the depth of the search. Terminates if no more vertices are + reachable within the distance of depth_limit + + offload : boolean, optional, default=False + Indicates if output should be written to the disk. + When not provided, the algorithms decides if offloading is needed + based on the input parameters. + + Returns + ------- + Return type is decided based on the input parameters (size of + sources, size of the graph, number of graphs and offload setting) + + If G is a cugraph.Graph and output fits in memory: + BFS_edge_lists : cudf.DataFrame + GPU data frame containing all BFS edges + source_offsets: cudf.Series + Series containing the starting offset in the returned edge list + for each source. + + If offload is True, or if the output does not fit in memory : + Writes csv files containing BFS output to the disk. + """ + raise NotImplementedError( + "concurrent_bfs is coming soon! 
Please up vote the github issue 1465\ + to help us prioritize" + ) + if not isinstance(Graphs, list): + raise TypeError( + "Graphs should be a list of cugraph.Graph or cugraph.DiGraph" + ) + if not isinstance(sources, list): + raise TypeError("sources should be a list of cudf.Series") + if len(Graphs) != len(sources): + raise ValueError( + "The size of the sources list must match\ + the size of the graph list." + ) + if offload is True: + raise NotImplementedError( + "Offloading is coming soon! Please up vote the github issue 1461\ + to help us prioritize" + ) + + # Consolidate graphs in a single graph and record components + + # Renumber and concatenate sources in a single df + + # Call multi_source_bfs + # multi_source_bfs( + # G, + # sources, + # components=components, + # depth_limit=depth_limit, + # offload=offload, + # ) + + +def multi_source_bfs( + G, sources, components=None, depth_limit=None, offload=False +): + """ + Find the breadth first traversal from multiple sources in a graph. + + Parameters + ---------- + G : cugraph.Graph or cugraph.DiGraph + The adjacency list will be computed if not already present. + + sources : cudf.Series + Subset of vertices from which the traversals start. A BFS is run for + each source in the Series. + The size of the series should be at least one and cannot exceed the + size of the graph. + + depth_limit : Integer, optional, default=None + Limit the depth of the search. Terminates if no more vertices are + reachable within the distance of depth_limit + + components : cudf.DataFrame, optional, default=None + GPU Dataframe containing the component information. + Passing this information may impact the return type. + When no component information is passed BFS uses one component + behavior settings. + + components['vertex'] : cudf.Series + vertex IDs + components['color'] : cudf.Series + component IDs/color for vertices. + + offload : boolean, optional, default=False + Indicates if output should be written to the disk. + When not provided, the algorithms decides if offloading is needed + based on the input parameters. + + Returns + ------- + Return value type is decided based on the input parameters (size of + sources, size of the graph, number of components and offload setting) + If G is a cugraph.Graph, returns : + cudf.DataFrame + df['vertex'] vertex IDs + + df['distance_'] path distance for each vertex from the + starting vertex. One column per source. + + df['predecessor_'] for each i'th position in the column, + the vertex ID immediately preceding the vertex at position i in + the 'vertex' column. One column per source. + + If G is a cugraph.Graph and component information is present returns : + BFS_edge_lists : cudf.DataFrame + GPU data frame containing all BFS edges + source_offsets: cudf.Series + Series containing the starting offset in the returned edge list + for each source. + + If offload is True, or if the output does not fit in memory : + Writes csv files containing BFS output to the disk. + """ + raise NotImplementedError( + "concurrent_bfs is coming soon! Please up vote the github issue 1465\ + to help us prioritize" + ) + # if components is not None: + # null_check(components["vertex"]) + # null_check(components["colors"]) + # + # if depth_limit is not None: + # raise NotImplementedError( + # "depth limit implementation of BFS is not currently supported" + # ) + + # if offload is True: + # raise NotImplementedError( + # "Offloading is coming soon! 
Please up vote the github issue 1461 + # to help us prioritize" + # ) + if isinstance(sources, list): + sources = cudf.Series(sources) + if G.renumbered is True: + sources = G.lookup_internal_vertex_id(cudf.Series(sources)) + if not G.adjlist: + G.view_adj_list() + # Memory footprint check + footprint = _get_feasibility( + G, sources, components=components, depth_limit=depth_limit + ) + print(footprint) + # Call multi_source_bfs + # FIXME remove when implemented + # raise NotImplementedError("Commming soon") diff --git a/python/cugraph/traversal/sssp.pxd b/python/cugraph/traversal/sssp.pxd index e4b709cb879..3109668d747 100644 --- a/python/cugraph/traversal/sssp.pxd +++ b/python/cugraph/traversal/sssp.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -16,9 +16,9 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * -cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": +cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython": cdef void call_sssp[vertex_t, weight_t]( const handle_t &handle, diff --git a/python/cugraph/traversal/sssp.py b/python/cugraph/traversal/sssp.py index 4ba754ad4ed..f3aebaf43bf 100644 --- a/python/cugraph/traversal/sssp.py +++ b/python/cugraph/traversal/sssp.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -14,7 +14,7 @@ import numpy as np import cudf -from cugraph.structure import Graph, DiGraph +from cugraph.structure import Graph, DiGraph, MultiGraph, MultiDiGraph from cugraph.traversal import sssp_wrapper from cugraph.utilities import (ensure_cugraph_obj, is_matrix_type, @@ -104,7 +104,7 @@ def _convert_df_to_output_type(df, input_type, return_predecessors): return_predecessors is only used for return values from cupy/scipy input types. """ - if input_type in [Graph, DiGraph]: + if input_type in [Graph, DiGraph, MultiGraph, MultiDiGraph]: return df elif (nx is not None) and (input_type in [nx.Graph, nx.DiGraph]): @@ -212,7 +212,11 @@ def sssp(G, matrix_graph_type=DiGraph if directed else Graph) if G.renumbered: - source = G.lookup_internal_vertex_id(cudf.Series([source]))[0] + if isinstance(source, cudf.DataFrame): + source = G.lookup_internal_vertex_id( + source, source.columns).iloc[0] + else: + source = G.lookup_internal_vertex_id(cudf.Series([source]))[0] if source is cudf.NA: raise ValueError( @@ -223,7 +227,7 @@ def sssp(G, if G.renumbered: df = G.unrenumber(df, "vertex") df = G.unrenumber(df, "predecessor") - df["predecessor"].fillna(-1, inplace=True) + df.fillna(-1, inplace=True) return _convert_df_to_output_type(df, input_type, return_predecessors) diff --git a/python/cugraph/traversal/sssp_wrapper.pyx b/python/cugraph/traversal/sssp_wrapper.pyx index 730fe0db94e..46966cd3e99 100644 --- a/python/cugraph/traversal/sssp_wrapper.pyx +++ b/python/cugraph/traversal/sssp_wrapper.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. 
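The `sssp` changes above add `MultiGraph`/`MultiDiGraph` to the accepted input types and allow the source to be given as a one-row DataFrame when the graph was built from multi-column vertices. A hedged sketch, where `G1` and the `v_0`/`v_1` column names are hypothetical and must match the vertex columns the graph was constructed with:

```python
import cudf
import cugraph

source = cudf.DataFrame({"v_0": [0], "v_1": [1000]})
df = cugraph.sssp(G1, source)  # internally mapped via lookup_internal_vertex_id
```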
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,17 +18,14 @@ cimport cugraph.traversal.sssp as c_sssp cimport cugraph.traversal.bfs as c_bfs -from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * from cugraph.structure import graph_primtypes_wrapper - from libcpp cimport bool from libc.stdint cimport uintptr_t -from libc.float cimport FLT_MAX_EXP - import cudf -import rmm import numpy as np + def sssp(input_graph, source): """ Call sssp @@ -49,7 +46,7 @@ def sssp(input_graph, source): cdef uintptr_t c_local_verts = NULL; cdef uintptr_t c_local_edges = NULL; cdef uintptr_t c_local_offsets = NULL; - weight_t = np.dtype("int32") + weight_t = np.dtype("float32") # Pointers for SSSP / BFS cdef uintptr_t c_identifier_ptr = NULL # Pointer to the DataFrame 'vertex' Series @@ -113,31 +110,21 @@ def sssp(input_graph, source): num_verts, num_edges, c_local_verts, c_local_edges, c_local_offsets) - if weights is not None: - if weight_t == np.float32: - c_sssp.call_sssp[int, float](handle_[0], - graph_container, - c_identifier_ptr, - c_distance_ptr, - c_predecessor_ptr, - source) - elif weight_t == np.float64: - c_sssp.call_sssp[int, double](handle_[0], - graph_container, - c_identifier_ptr, - c_distance_ptr, - c_predecessor_ptr, - source) - else: # This case should not happen - raise NotImplementedError - else: - c_bfs.call_bfs[int, float](handle_[0], - graph_container, - c_identifier_ptr, - c_distance_ptr, - c_predecessor_ptr, - NULL, - source, - 1) + if weight_t == np.float32: + c_sssp.call_sssp[int, float](handle_[0], + graph_container, + c_identifier_ptr, + c_distance_ptr, + c_predecessor_ptr, + source) + elif weight_t == np.float64: + c_sssp.call_sssp[int, double](handle_[0], + graph_container, + c_identifier_ptr, + c_distance_ptr, + c_predecessor_ptr, + source) + else: # This case should not happen + raise NotImplementedError return df diff --git a/python/cugraph/traversal/traveling_salesperson.pxd b/python/cugraph/traversal/traveling_salesperson.pxd new file mode 100644 index 00000000000..b38c18c7633 --- /dev/null +++ b/python/cugraph/traversal/traveling_salesperson.pxd @@ -0,0 +1,33 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +from cugraph.structure.graph_primtypes cimport * + +cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": + + cdef float traveling_salesperson(const handle_t &handle, + int *vtx_ptr, + float *x_pos, + float *y_pos, + int nodes, + int restarts, + bool beam_search, + int k, + int nstart, + bool verbose, + int *route) except + diff --git a/python/cugraph/traversal/traveling_salesperson.py b/python/cugraph/traversal/traveling_salesperson.py new file mode 100644 index 00000000000..53d411c92ae --- /dev/null +++ b/python/cugraph/traversal/traveling_salesperson.py @@ -0,0 +1,76 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cugraph.traversal import traveling_salesperson_wrapper +from cugraph.structure.graph_classes import null_check +import cudf + + +def traveling_salesperson(pos_list, + restarts=100000, + beam_search=True, + k=4, + nstart=None, + verbose=False, + ): + """ + Finds an approximate solution to the traveling salesperson problem (TSP). + cuGraph computes an approximation of the TSP problem using hill climbing + optimization. + + The current implementation does not support a weighted graph. + + Parameters + ---------- + pos_list: cudf.DataFrame + Data frame with initial vertex positions containing three columns: + 'vertex' ids and 'x', 'y' positions. + restarts: int + Number of starts to try. The more restarts, the better the solution + will be approximated. The number of restarts depends on the problem + size and should be kept low for instances above 2k cities. + beam_search: bool + Specify if the initial solution should use KNN for an approximation + solution. + k: int + Beam width to use in the search. + nstart: int + Vertex id to use as starting position. + verbose: bool + Logs configuration and iterative improvement. + + Returns + ------- + route : cudf.Series + cudf.Series of size V containing the ordered list of vertices + than needs to be visited. + """ + + if not isinstance(pos_list, cudf.DataFrame): + raise TypeError("Instance should be cudf.DataFrame") + + null_check(pos_list['vertex']) + null_check(pos_list['x']) + null_check(pos_list['y']) + + if nstart is not None and not pos_list[pos_list['vertex'] == nstart].index: + raise ValueError("nstart should be in vertex ids") + + route, cost = traveling_salesperson_wrapper.traveling_salesperson( + pos_list, + restarts, + beam_search, + k, + nstart, + verbose) + return route, cost diff --git a/python/cugraph/traversal/traveling_salesperson_wrapper.pyx b/python/cugraph/traversal/traveling_salesperson_wrapper.pyx new file mode 100644 index 00000000000..6eccce57a37 --- /dev/null +++ b/python/cugraph/traversal/traveling_salesperson_wrapper.pyx @@ -0,0 +1,82 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
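When `nstart` is supplied to `traveling_salesperson`, it must be one of the IDs in `pos_list['vertex']`; the wrapper translates it to a positional index before calling into C++. A sketch with all keyword arguments spelled out (`pos_list` as in the earlier example):

```python
route, cost = cugraph.traveling_salesperson(pos_list, restarts=1024,
                                            beam_search=True, k=4,
                                            nstart=0, verbose=False)
```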
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +from cugraph.traversal.traveling_salesperson cimport traveling_salesperson as c_traveling_salesperson +from cugraph.structure import graph_primtypes_wrapper +from cugraph.structure.graph_primtypes cimport * +from libcpp cimport bool +from libc.stdint cimport uintptr_t +from numba import cuda +import cudf +import numpy as np + + +def traveling_salesperson(pos_list, + restarts=100000, + beam_search=True, + k=4, + nstart=None, + verbose=False, + renumber=True, +): + """ + Call traveling_salesperson + """ + + nodes = pos_list.shape[0] + cdef uintptr_t x_pos = NULL + cdef uintptr_t y_pos = NULL + + pos_list['vertex'] = pos_list['vertex'].astype(np.int32) + pos_list['x'] = pos_list['x'].astype(np.float32) + pos_list['y'] = pos_list['y'].astype(np.float32) + x_pos = pos_list['x'].__cuda_array_interface__['data'][0] + y_pos = pos_list['y'].__cuda_array_interface__['data'][0] + + cdef unique_ptr[handle_t] handle_ptr + handle_ptr.reset(new handle_t()) + handle_ = handle_ptr.get(); + + cdef float final_cost = 0.0 + + cdef uintptr_t route_ptr = NULL + route_arr = cuda.device_array(nodes, dtype=np.int32) + route_ptr = route_arr.device_ctypes_pointer.value + + cdef uintptr_t vtx_ptr = NULL + vtx_ptr = pos_list['vertex'].__cuda_array_interface__['data'][0] + + if nstart is None: + renumbered_nstart = 0 + else: + renumbered_nstart = pos_list[pos_list['vertex'] == nstart].index[0] + + final_cost = c_traveling_salesperson(handle_[0], + vtx_ptr, + x_pos, + y_pos, + nodes, + restarts, + beam_search, + k, + renumbered_nstart, + verbose, + route_ptr) + + route = cudf.Series(route_arr) + return route, final_cost diff --git a/python/cugraph/tree/minimum_spanning_tree.pxd b/python/cugraph/tree/minimum_spanning_tree.pxd index 8cea2bee0cc..32c76ede554 100644 --- a/python/cugraph/tree/minimum_spanning_tree.pxd +++ b/python/cugraph/tree/minimum_spanning_tree.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -19,7 +19,7 @@ from cugraph.structure.graph_primtypes cimport * -cdef extern from "algorithms.hpp" namespace "cugraph": +cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": cdef unique_ptr[GraphCOO[VT,ET,WT]] minimum_spanning_tree[VT,ET,WT](const handle_t &handle, const GraphCSRView[VT,ET,WT] &graph) except + diff --git a/python/cugraph/tree/minimum_spanning_tree.py b/python/cugraph/tree/minimum_spanning_tree.py index 25a365665df..6a5f7b5bf38 100644 --- a/python/cugraph/tree/minimum_spanning_tree.py +++ b/python/cugraph/tree/minimum_spanning_tree.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -12,12 +12,12 @@ # limitations under the License. from cugraph.tree import minimum_spanning_tree_wrapper -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph from cugraph.utilities import check_nx_graph from cugraph.utilities import cugraph_to_nx -def minimum_spanning_tree_subgraph(G): +def _minimum_spanning_tree_subgraph(G): mst_subgraph = Graph() if type(G) is not Graph: raise Exception("input graph must be undirected") @@ -32,7 +32,7 @@ def minimum_spanning_tree_subgraph(G): return mst_subgraph -def maximum_spanning_tree_subgraph(G): +def _maximum_spanning_tree_subgraph(G): mst_subgraph = Graph() if type(G) is not Graph: raise Exception("input graph must be undirected") @@ -68,28 +68,33 @@ def minimum_spanning_tree( ---------- G : cuGraph.Graph or networkx.Graph cuGraph graph descriptor with connectivity information. + weight : string default to the weights in the graph, if the graph edges do not have a weight attribute a default weight of 1 will be used. + algorithm : string Default to 'boruvka'. The parallel algorithm to use when finding a minimum spanning tree. + ignore_nan : bool Default to False + Returns ------- G_mst : cuGraph.Graph or networkx.Graph A graph descriptor with a minimum spanning tree or forest. The networkx graph will not have all attributes copied over + """ G, isNx = check_nx_graph(G) if isNx is True: - mst = minimum_spanning_tree_subgraph(G) + mst = _minimum_spanning_tree_subgraph(G) return cugraph_to_nx(mst) else: - return minimum_spanning_tree_subgraph(G) + return _minimum_spanning_tree_subgraph(G) def maximum_spanning_tree( @@ -103,25 +108,30 @@ def maximum_spanning_tree( ---------- G : cuGraph.Graph or networkx.Graph cuGraph graph descriptor with connectivity information. + weight : string default to the weights in the graph, if the graph edges do not have a weight attribute a default weight of 1 will be used. + algorithm : string Default to 'boruvka'. The parallel algorithm to use when finding a maximum spanning tree. + ignore_nan : bool Default to False + Returns ------- G_mst : cuGraph.Graph or networkx.Graph A graph descriptor with a maximum spanning tree or forest. The networkx graph will not have all attributes copied over + """ G, isNx = check_nx_graph(G) if isNx is True: - mst = maximum_spanning_tree_subgraph(G) + mst = _maximum_spanning_tree_subgraph(G) return cugraph_to_nx(mst) else: - return maximum_spanning_tree_subgraph(G) + return _maximum_spanning_tree_subgraph(G) diff --git a/python/cugraph/utilities/__init__.py b/python/cugraph/utilities/__init__.py index 61f5596eee6..6dc23ff53b7 100644 --- a/python/cugraph/utilities/__init__.py +++ b/python/cugraph/utilities/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -24,4 +24,6 @@ is_matrix_type, is_cp_matrix_type, is_sp_matrix_type, + renumber_vertex_pair ) +from cugraph.utilities.path_retrieval import get_traversed_cost diff --git a/python/cugraph/utilities/path_retrieval.pxd b/python/cugraph/utilities/path_retrieval.pxd new file mode 100644 index 00000000000..dcbbef5127d --- /dev/null +++ b/python/cugraph/utilities/path_retrieval.pxd @@ -0,0 +1,29 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
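Only the module-level helpers gain a leading underscore above; the public spanning-tree API is unchanged. Typical use, assuming a weighted edge list `cu_M` as elsewhere in the tests:

```python
import cugraph

G = cugraph.Graph()  # both entry points require an undirected Graph
G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2")

G_mst = cugraph.minimum_spanning_tree(G)
G_maxst = cugraph.maximum_spanning_tree(G)
```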
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+from cugraph.structure.graph_primtypes cimport *
+
+cdef extern from "cugraph/utilities/path_retrieval.hpp" namespace "cugraph":
+
+    cdef void get_traversed_cost[vertex_t, weight_t](const handle_t &handle,
+                                                     const vertex_t *vertices,
+                                                     const vertex_t *preds,
+                                                     const weight_t *info_weights,
+                                                     weight_t *out,
+                                                     vertex_t stop_vertex,
+                                                     vertex_t num_vertices) except +
diff --git a/python/cugraph/utilities/path_retrieval.py b/python/cugraph/utilities/path_retrieval.py
new file mode 100644
index 00000000000..b9baadc2f21
--- /dev/null
+++ b/python/cugraph/utilities/path_retrieval.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import cudf
+
+from cugraph.structure.symmetrize import symmetrize
+from cugraph.structure.number_map import NumberMap
+from cugraph.utilities import path_retrieval_wrapper
+
+
+def get_traversed_cost(df, source, source_col, dest_col, value_col):
+    """
+    Takes the DataFrame result of a BFS or SSSP function call and sums
+    the given weights along each vertex's path back to the starting vertex.
+    The source_col and dest_col identifiers need to match the vertex and
+    predecessor columns of df.
+
+    Parameters
+    ----------
+    df : cudf.DataFrame
+        The dataframe containing the results of a BFS or SSSP call
+    source : int
+        Index of the source vertex.
+    source_col : cudf.Series
+        This cudf.Series wraps a gdf_column of size E (E: number of edges).
+        The gdf column contains the source index for each edge.
+        Source indices must be an integer type.
+    dest_col : cudf.Series
+        This cudf.Series wraps a gdf_column of size E (E: number of edges).
+        The gdf column contains the destination index for each edge.
+        Destination indices must be an integer type.
+    value_col : cudf.Series
+        This cudf.Series wraps a gdf_column of size E (E: number of edges).
+        The gdf column contains the weight associated with each edge.
+        Weights must be a floating point type.
+
+    Returns
+    -------
+    df : cudf.DataFrame
+        DataFrame containing two columns 'vertex' and 'info'.
+        Unreachable vertices will have the max value of the weight type.
+ """ + + if 'vertex' not in df.columns: + raise ValueError("DataFrame does not appear to be a BFS or " + "SSP result - 'vertex' column missing") + if 'distance' not in df.columns: + raise ValueError("DataFrame does not appear to be a BFS or " + "SSP result - 'distance' column missing") + if 'predecessor' not in df.columns: + raise ValueError("DataFrame does not appear to be a BFS or " + "SSP result - 'predecessor' column missing") + + src, dst, val = symmetrize(source_col, + dest_col, + value_col) + + symmetrized_df = cudf.DataFrame() + symmetrized_df['source'] = src + symmetrized_df['destination'] = dst + symmetrized_df['weights'] = val + + input_df = df.merge(symmetrized_df, + left_on=['vertex', 'predecessor'], + right_on=['source', 'destination'], + how="left" + ) + + # Set unreachable vertex weights to max float and source vertex weight to 0 + max_val = np.finfo(val.dtype).max + input_df[['weights']] = input_df[['weights']].fillna(max_val) + input_df.loc[input_df['vertex'] == source, 'weights'] = 0 + + # Renumber + renumbered_gdf, renumber_map = NumberMap.renumber(input_df, + ["vertex"], + ["predecessor"], + preserve_order=True) + renumbered_gdf = renumbered_gdf.rename(columns={'src': 'vertex', + 'dst': 'predecessor'}) + stop_vertex = renumber_map.to_internal_vertex_id(cudf.Series(-1)).values[0] + + out_df = path_retrieval_wrapper.get_traversed_cost(renumbered_gdf, + stop_vertex) + + # Unrenumber + out_df['vertex'] = renumber_map.unrenumber(renumbered_gdf, 'vertex', + preserve_order=True)["vertex"] + return out_df diff --git a/python/cugraph/utilities/path_retrieval_wrapper.pyx b/python/cugraph/utilities/path_retrieval_wrapper.pyx new file mode 100644 index 00000000000..98d11ad07df --- /dev/null +++ b/python/cugraph/utilities/path_retrieval_wrapper.pyx @@ -0,0 +1,72 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+from cugraph.utilities.path_retrieval cimport get_traversed_cost as c_get_traversed_cost
+from cugraph.structure.graph_primtypes cimport *
+from libc.stdint cimport uintptr_t
+from numba import cuda
+import cudf
+import numpy as np
+
+
+def get_traversed_cost(input_df, stop_vertex):
+    """
+    Call get_traversed_cost
+    """
+    num_verts = input_df.shape[0]
+    vertex_t = input_df.vertex.dtype
+    weight_t = input_df.weights.dtype
+
+    df = cudf.DataFrame()
+    df['vertex'] = input_df['vertex']
+    df['info'] = cudf.Series(np.zeros(num_verts, dtype=weight_t))
+
+    cdef unique_ptr[handle_t] handle_ptr
+    handle_ptr.reset(new handle_t())
+    handle_ = handle_ptr.get()
+
+    cdef uintptr_t vertices = NULL
+    cdef uintptr_t preds = NULL
+    cdef uintptr_t out = NULL
+    cdef uintptr_t info_weights = NULL
+
+    vertices = input_df['vertex'].__cuda_array_interface__['data'][0]
+    preds = input_df['predecessor'].__cuda_array_interface__['data'][0]
+    info_weights = input_df['weights'].__cuda_array_interface__['data'][0]
+    out = df['info'].__cuda_array_interface__['data'][0]
+
+    # The raw pointers must be cast so the templated C++ function is
+    # instantiated for the right vertex/weight types (int32 vertices here).
+    if weight_t == np.float32:
+        c_get_traversed_cost(handle_[0],
+                             <int*> vertices,
+                             <int*> preds,
+                             <float*> info_weights,
+                             <float*> out,
+                             <int> stop_vertex,
+                             <int> num_verts)
+    elif weight_t == np.float64:
+        c_get_traversed_cost(handle_[0],
+                             <int*> vertices,
+                             <int*> preds,
+                             <double*> info_weights,
+                             <double*> out,
+                             <int> stop_vertex,
+                             <int> num_verts)
+    else:
+        raise NotImplementedError
+
+    return df
diff --git a/python/cugraph/utilities/utils.py b/python/cugraph/utilities/utils.py
index f1a320cd1ef..e4e93513630 100644
--- a/python/cugraph/utilities/utils.py
+++ b/python/cugraph/utilities/utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -14,6 +14,11 @@
 from numba import cuda
 import cudf
 
+from rmm._cuda.gpu import (
+    getDeviceAttribute,
+    cudaDeviceAttr,
+)
+
 
 # optional dependencies
 try:
@@ -21,6 +26,7 @@
     from cupyx.scipy.sparse.coo import coo_matrix as cp_coo_matrix
     from cupyx.scipy.sparse.csr import csr_matrix as cp_csr_matrix
     from cupyx.scipy.sparse.csc import csc_matrix as cp_csc_matrix
+
     CP_MATRIX_TYPES = [cp_coo_matrix, cp_csr_matrix, cp_csc_matrix]
     CP_COMPRESSED_MATRIX_TYPES = [cp_csr_matrix, cp_csc_matrix]
 except ModuleNotFoundError:
@@ -33,6 +39,7 @@
     from scipy.sparse.coo import coo_matrix as sp_coo_matrix
     from scipy.sparse.csr import csr_matrix as sp_csr_matrix
     from scipy.sparse.csc import csc_matrix as sp_csc_matrix
+
     SP_MATRIX_TYPES = [sp_coo_matrix, sp_csr_matrix, sp_csc_matrix]
     SP_COMPRESSED_MATRIX_TYPES = [sp_csr_matrix, sp_csc_matrix]
 except ModuleNotFoundError:
@@ -55,8 +62,8 @@ def get_traversed_path(df, id):
     ----------
     df : cudf.DataFrame
         The dataframe containing the results of a BFS or SSSP call
-    id : Int
-        The vertex ID
+    id : vertex ID
+        must be the same data type as the 'vertex' column in the dataframe
 
     Returns
     ---------
@@ -75,33 +82,40 @@ def get_traversed_path(df, id):
     >>> path = cugraph.utils.get_traversed_path(sssp_df, 32)
     """
 
-    if 'vertex' not in df.columns:
-        raise ValueError("DataFrame does not appear to be a BFS or "
-                         "SSP result - 'vertex' column missing")
-    if 'distance' not in df.columns:
-        raise ValueError("DataFrame does not appear to be a BFS or "
-                         "SSP result - 'distance' column missing")
-    if 'predecessor' not in df.columns:
-        raise ValueError("DataFrame does not appear to be a BFS or "
-                         "SSP result - 'predecessor' column missing")
-    if type(id) != int:
-        raise ValueError("The vertex 'id' needs to be an integer")
+    if "vertex" not in df.columns:
+        raise ValueError(
+            "DataFrame does not appear to be a BFS or "
+            "SSSP result - 'vertex' column missing"
+        )
+    if "distance" not in df.columns:
+        raise ValueError(
+            "DataFrame does not appear to be a BFS or "
+            "SSSP result - 'distance' column missing"
+        )
+    if "predecessor" not in df.columns:
+        raise ValueError(
+            "DataFrame does not appear to be a BFS or "
+            "SSSP result - 'predecessor' column missing"
+        )
+    if not isinstance(id, type(df['vertex'].iloc[0])):
+        raise ValueError(
+            "The vertex 'id' needs to be the same type as df['vertex']")
 
     # There is no guarantee that the dataframe has not been filtered
     # or edited.  Therefore we cannot assume that using the vertex ID
     # as an index will work
 
-    ddf = df[df['vertex'] == id]
+    ddf = df[df["vertex"] == id]
 
     if len(ddf) == 0:
         raise ValueError("The vertex (", id, " is not in the result set")
 
-    pred = ddf['predecessor'].iloc[0]
+    pred = ddf["predecessor"].iloc[0]
 
     answer = []
     answer.append(ddf)
 
     while pred != -1:
-        ddf = df[df['vertex'] == pred]
-        pred = ddf['predecessor'].iloc[0]
+        ddf = df[df["vertex"] == pred]
+        pred = ddf["predecessor"].iloc[0]
         answer.append(ddf)
 
     return cudf.concat(answer)
 
@@ -133,17 +147,24 @@ def get_traversed_path_list(df, id):
     >>> path = cugraph.utils.get_traversed_path_list(sssp_df, 32)
     """
 
-    if 'vertex' not in df.columns:
-        raise ValueError("DataFrame does not appear to be a BFS or "
-                         "SSP result - 'vertex' column missing")
-    if 'distance' not in df.columns:
-        raise ValueError("DataFrame does not appear to be a BFS or "
-                         "SSP result - 'distance' column missing")
-    if 'predecessor' not in df.columns:
-        raise ValueError("DataFrame does not appear to be a BFS or "
-                         "SSP result - 'predecessor' column missing")
-    if type(id) != int:
-        raise ValueError("The vertex 'id' needs to be an integer")
+    if "vertex" not in df.columns:
+        raise ValueError(
+            "DataFrame does not appear to be a BFS or "
+            "SSSP result - 'vertex' column missing"
+        )
+    if "distance" not in df.columns:
+        raise ValueError(
+            "DataFrame does not appear to be a BFS or "
+            "SSSP result - 'distance' column missing"
+        )
+    if "predecessor" not in df.columns:
+        raise ValueError(
+            "DataFrame does not appear to be a BFS or "
+            "SSSP result - 'predecessor' column missing"
+        )
+    if not isinstance(id, type(df['vertex'].iloc[0])):
+        raise ValueError(
+            "The vertex 'id' needs to be the same type as df['vertex']")
 
     # There is no guarantee that the dataframe has not been filtered
     # or edited.  Therefore we cannot assume that using the vertex ID
@@ -153,17 +174,17 @@
 
     answer = []
     answer.append(id)
 
-    ddf = df[df['vertex'] == id]
+    ddf = df[df["vertex"] == id]
 
     if len(ddf) == 0:
         raise ValueError("The vertex (", id, " is not in the result set")
 
-    pred = ddf['predecessor'].iloc[0]
+    pred = ddf["predecessor"].iloc[0]
 
     while pred != -1:
         answer.append(pred)
 
-        ddf = df[df['vertex'] == pred]
-        pred = ddf['predecessor'].iloc[0]
+        ddf = df[df["vertex"] == pred]
+        pred = ddf["predecessor"].iloc[0]
 
     return answer
 
@@ -182,6 +203,33 @@ def is_cuda_version_less_than(min_version=(10, 2)):
     return False
 
 
+def is_device_version_less_than(min_version=(7, 0)):
+    """
+    Returns True if the device's compute capability is less than min_version
+    """
+    major_version = getDeviceAttribute(
+        cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, 0
+    )
+    minor_version = getDeviceAttribute(
+        cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, 0
+    )
+    if major_version > min_version[0]:
+        return False
+    if major_version < min_version[0]:
+        return True
+    if minor_version < min_version[1]:
+        return True
+    return False
+
+
+def get_device_memory_info():
+    """
+    Returns the total amount of global memory on the device in bytes
+    """
+    meminfo = cuda.current_context().get_memory_info()
+    return meminfo[1]
+
+
 # FIXME: if G is a Nx type, the weight attribute is assumed to be "weight", if
 # set. An additional optional parameter for the weight attr name when accepting
 # Nx graphs may be needed. From the Nx docs:
@@ -195,39 +243,45 @@ def ensure_cugraph_obj(obj, nx_weight_attr=None, matrix_graph_type=None):
     cugraph Graph-type obj to create when converting from a matrix type.
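To show how the two new device-introspection helpers added to utils.py above are meant to be consumed, here is a short, hedged sketch; the threshold and messages are illustrative, and it assumes the helpers are imported straight from cugraph.utilities.utils.

from cugraph.utilities.utils import (is_device_version_less_than,
                                     get_device_memory_info)

# Gate a Volta-or-newer code path (compute capability 7.0).
if is_device_version_less_than((7, 0)):
    print("pre-Volta GPU: falling back to the portable implementation")

# Size working buffers against the GPU's total global memory.
total_bytes = get_device_memory_info()
print("device reports {:.1f} GiB of global memory".format(total_bytes / 2**30))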
""" # FIXME: importing here to avoid circular import - from cugraph.structure import Graph, DiGraph + from cugraph.structure import Graph, DiGraph, MultiGraph, MultiDiGraph from cugraph.utilities.nx_factory import convert_from_nx input_type = type(obj) - if input_type in [Graph, DiGraph]: + if input_type in [Graph, DiGraph, MultiGraph, MultiDiGraph]: return (obj, input_type) elif (nx is not None) and (input_type in [nx.Graph, nx.DiGraph]): return (convert_from_nx(obj, weight=nx_weight_attr), input_type) - elif (input_type in CP_MATRIX_TYPES) or \ - (input_type in SP_MATRIX_TYPES): + elif (input_type in CP_MATRIX_TYPES) or (input_type in SP_MATRIX_TYPES): if matrix_graph_type is None: matrix_graph_type = Graph elif matrix_graph_type not in [Graph, DiGraph]: - raise TypeError(f"matrix_graph_type must be either a cugraph " - f"Graph or DiGraph, got: {matrix_graph_type}") - - if input_type in (CP_COMPRESSED_MATRIX_TYPES + - SP_COMPRESSED_MATRIX_TYPES): + raise TypeError( + f"matrix_graph_type must be either a cugraph " + f"Graph or DiGraph, got: {matrix_graph_type}" + ) + + if input_type in ( + CP_COMPRESSED_MATRIX_TYPES + SP_COMPRESSED_MATRIX_TYPES + ): coo = obj.tocoo(copy=False) else: coo = obj if input_type in CP_MATRIX_TYPES: - df = cudf.DataFrame({"source": cp.ascontiguousarray(coo.row), - "destination": cp.ascontiguousarray(coo.col), - "weight": cp.ascontiguousarray(coo.data)}) + df = cudf.DataFrame( + { + "source": cp.ascontiguousarray(coo.row), + "destination": cp.ascontiguousarray(coo.col), + "weight": cp.ascontiguousarray(coo.data), + } + ) else: - df = cudf.DataFrame({"source": coo.row, - "destination": coo.col, - "weight": coo.data}) + df = cudf.DataFrame( + {"source": coo.row, "destination": coo.col, "weight": coo.data} + ) # FIXME: # * do a quick check that symmetry is stored explicitly in the cupy # data for sym matrices (ie. for each uv, check vu is there) @@ -281,3 +335,23 @@ def import_optional(mod, import_from=None): pass return namespace.get(mod) + + +def renumber_vertex_pair(input_graph, vertex_pair): + vertex_size = input_graph.vertex_column_size() + columns = vertex_pair.columns.to_list() + if vertex_size == 1: + for col in vertex_pair.columns: + if input_graph.renumbered: + vertex_pair = input_graph.add_internal_vertex_id( + vertex_pair, col, col + ) + else: + if input_graph.renumbered: + vertex_pair = input_graph.add_internal_vertex_id( + vertex_pair, "src", columns[:vertex_size] + ) + vertex_pair = input_graph.add_internal_vertex_id( + vertex_pair, "dst", columns[vertex_size:] + ) + return vertex_pair diff --git a/python/pytest.ini b/python/pytest.ini index 33c82fe48f7..a1933ea34aa 100644 --- a/python/pytest.ini +++ b/python/pytest.ini @@ -5,12 +5,15 @@ addopts = --benchmark-min-rounds=1 --benchmark-columns="mean, rounds" --benchmark-gpu-disable + --cov=cugraph + --cov-report term-missing:skip-covered markers = managedmem_on: RMM managed memory enabled managedmem_off: RMM managed memory disabled poolallocator_on: RMM pool allocator enabled poolallocator_off: RMM pool allocator disabled + preset_gpu_count: Use a hard-coded number of GPUs for specific MG tests ETL: benchmarks for ETL steps small: small datasets tiny: tiny datasets diff --git a/python/setup.py b/python/setup.py index d99ff12cfa1..a2dde239cca 100644 --- a/python/setup.py +++ b/python/setup.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,29 +13,51 @@ import os import sys +import sysconfig import shutil -from setuptools import setup, find_packages, Command -from setuptools.extension import Extension -from setuputils import use_raft_package, get_environment_option +# Must import in this order: +# setuptools -> Cython.Distutils.build_ext -> setuptools.command.build_ext +# Otherwise, setuptools.command.build_ext ends up inheriting from +# Cython.Distutils.old_build_ext which we do not want +import setuptools try: - from Cython.Distutils.build_ext import new_build_ext as build_ext + from Cython.Distutils.build_ext import new_build_ext as _build_ext except ImportError: - from setuptools.command.build_ext import build_ext + from setuptools.command.build_ext import build_ext as _build_ext -import versioneer from distutils.sysconfig import get_python_lib +import setuptools.command.build_ext +from setuptools import find_packages, setup, Command +from setuptools.extension import Extension + +from setuputils import use_raft_package, get_environment_option + +import versioneer + INSTALL_REQUIRES = ['numba', 'cython'] +CYTHON_FILES = ['cugraph/**/*.pyx'] + +UCX_HOME = get_environment_option("UCX_HOME") +CUDA_HOME = get_environment_option('CUDA_HOME') +CONDA_PREFIX = get_environment_option('CONDA_PREFIX') conda_lib_dir = os.path.normpath(sys.prefix) + '/lib' conda_include_dir = os.path.normpath(sys.prefix) + '/include' -CYTHON_FILES = ['cugraph/**/*.pyx'] +if CONDA_PREFIX: + conda_include_dir = CONDA_PREFIX + '/include' + conda_lib_dir = CONDA_PREFIX + '/lib' + +if not UCX_HOME: + UCX_HOME = CONDA_PREFIX if CONDA_PREFIX else os.sys.prefix + +ucx_include_dir = os.path.join(UCX_HOME, "include") +ucx_lib_dir = os.path.join(UCX_HOME, "lib") -CUDA_HOME = os.environ.get("CUDA_HOME", False) if not CUDA_HOME: path_to_cuda_gdb = shutil.which("cuda-gdb") if path_to_cuda_gdb is None: @@ -53,11 +75,7 @@ ) cuda_include_dir = os.path.join(CUDA_HOME, "include") - -if (os.environ.get('CONDA_PREFIX', None)): - conda_prefix = os.environ.get('CONDA_PREFIX') - conda_include_dir = conda_prefix + '/include' - conda_lib_dir = conda_prefix + '/lib' +cuda_lib_dir = os.path.join(CUDA_HOME, "lib64") # Optional location of C++ build folder that can be configured by the user libcugraph_path = get_environment_option('CUGRAPH_BUILD_PATH') @@ -65,8 +83,38 @@ raft_path = get_environment_option('RAFT_PATH') # FIXME: This could clone RAFT, even if it's not needed (eg. running --clean). 
-raft_include_dir = use_raft_package(raft_path, libcugraph_path, - git_info_file='../cpp/CMakeLists.txt') +# deprecated: This functionality will go away after +# https://github.com/rapidsai/raft/issues/83 +raft_include_dir = use_raft_package(raft_path, libcugraph_path) + +if not libcugraph_path: + libcugraph_path = conda_lib_dir + +extensions = [ + Extension("*", + sources=CYTHON_FILES, + include_dirs=[ + conda_include_dir, + ucx_include_dir, + '../cpp/include', + "../thirdparty/cub", + raft_include_dir, + os.path.join(conda_include_dir, "libcudacxx"), + cuda_include_dir, + os.path.dirname(sysconfig.get_path("include")) + ], + library_dirs=[ + get_python_lib(), + conda_lib_dir, + libcugraph_path, + ucx_lib_dir, + cuda_lib_dir, + os.path.join(os.sys.prefix, "lib") + ], + libraries=['cudart', 'cusparse', 'cusolver', 'cugraph', 'nccl'], + language='c++', + extra_compile_args=['-std=c++17']) +] class CleanCommand(Command): @@ -92,33 +140,50 @@ def run(self): os.system('find . -name "*.cpython*.so" -type f -delete') +class build_ext_no_debug(_build_ext): + + def build_extensions(self): + def remove_flags(compiler, *flags): + for flag in flags: + try: + compiler.compiler_so = list( + filter((flag).__ne__, compiler.compiler_so) + ) + except Exception: + pass + # Full optimization + self.compiler.compiler_so.append("-O3") + # No debug symbols, full optimization, no '-Wstrict-prototypes' warning + remove_flags( + self.compiler, "-g", "-G", "-O1", "-O2", "-Wstrict-prototypes" + ) + super().build_extensions() + + def finalize_options(self): + if self.distribution.ext_modules: + # Delay import this to allow for Cython-less installs + from Cython.Build.Dependencies import cythonize + + nthreads = getattr(self, "parallel", None) # -j option in Py3.5+ + nthreads = int(nthreads) if nthreads else None + self.distribution.ext_modules = cythonize( + self.distribution.ext_modules, + nthreads=nthreads, + force=self.force, + gdb_debug=False, + compiler_directives=dict( + profile=False, language_level=3, embedsignature=True + ), + ) + # Skip calling super() and jump straight to setuptools + setuptools.command.build_ext.build_ext.finalize_options(self) + + cmdclass = dict() cmdclass.update(versioneer.get_cmdclass()) -cmdclass["build_ext"] = build_ext +cmdclass["build_ext"] = build_ext_no_debug cmdclass["clean"] = CleanCommand -EXTENSIONS = [ - Extension("*", - sources=CYTHON_FILES, - include_dirs=[conda_include_dir, - '../cpp/include', - "../thirdparty/cub", - raft_include_dir, - os.path.join( - conda_include_dir, "libcudf", "libcudacxx"), - cuda_include_dir], - library_dirs=[get_python_lib()], - runtime_library_dirs=[conda_lib_dir], - libraries=['cugraph', 'cudf', 'nccl'], - language='c++', - extra_compile_args=['-std=c++14']) -] - -for e in EXTENSIONS: - e.cython_directives = dict( - profile=False, language_level=3, embedsignature=True - ) - setup(name='cugraph', description="cuGraph - GPU Graph Analytics", version=versioneer.get_version(), @@ -132,8 +197,8 @@ def run(self): ], # Include the separately-compiled shared library author="NVIDIA Corporation", - setup_requires=['cython'], - ext_modules=EXTENSIONS, + setup_requires=['Cython>=0.29,<0.30'], + ext_modules=extensions, packages=find_packages(include=['cugraph', 'cugraph.*']), install_requires=INSTALL_REQUIRES, license="Apache", diff --git a/python/setuputils.py b/python/setuputils.py index 360526c2b56..a64e60e1c32 100644 --- a/python/setuputils.py +++ b/python/setuputils.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2018-2020, NVIDIA CORPORATION. 
+# Copyright (c) 2018-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,7 +20,6 @@ import shutil import subprocess import sys -import warnings from pathlib import Path @@ -71,7 +70,7 @@ def clean_folder(path): def use_raft_package(raft_path, cpp_build_path, - git_info_file='../cpp/cmake/Dependencies.cmake'): + git_info_file=None): """ Function to use the python code in RAFT in package.raft @@ -79,8 +78,8 @@ def use_raft_package(raft_path, cpp_build_path, if you want to change RAFT location. - Uses RAFT located in $RAFT_PATH if $RAFT_PATH exists. - Otherwise it will look for RAFT in the libcugraph build folder, - located either in the default location ../cpp/build or in - $CUGRAPH_BUILD_PATH. + located either in the default locations ../cpp/build/raft, + ../cpp/build/_deps/raft-src, or in $CUGRAPH_BUILD_PATH. -Otherwise it will clone RAFT into _external_repositories. - Branch/git tag cloned is located in git_info_file in this case. @@ -88,17 +87,25 @@ def use_raft_package(raft_path, cpp_build_path, ------- raft_include_path: Str Path to the C++ include folder of RAFT + """ if os.path.isdir('cugraph/raft'): raft_path = os.path.realpath('cugraph/raft') # walk up two dirs from `python/raft` raft_path = os.path.join(raft_path, '..', '..') print("-- Using existing RAFT folder") + elif cpp_build_path and os.path.isdir(os.path.join(cpp_build_path, + '_deps/raft-src')): + raft_path = os.path.join(cpp_build_path, '_deps/raft-src') + raft_path = os.path.realpath(raft_path) + print("-- Using existing RAFT folder in CPP build dir from cmake " + "FetchContent") elif cpp_build_path and os.path.isdir(os.path.join(cpp_build_path, 'raft/src/raft')): raft_path = os.path.join(cpp_build_path, 'raft/src/raft') raft_path = os.path.realpath(raft_path) - print("-- Using existing RAFT folder in CPP build dir") + print("-- Using existing RAFT folder in CPP build dir from cmake " + "ExternalProject") elif isinstance(raft_path, (str, os.PathLike)): print('-- Using RAFT_PATH argument') elif os.environ.get('RAFT_PATH', False) is not False: @@ -131,10 +138,13 @@ def use_raft_package(raft_path, cpp_build_path, def clone_repo_if_needed(name, cpp_build_path=None, git_info_file=None): if git_info_file is None: - git_info_file = _get_repo_path() + '/cpp/CMakeLists.txt' + git_info_file = \ + _get_repo_path() + '/cpp/cmake/thirdparty/get_{}.cmake'.format( + name + ) if cpp_build_path is None or cpp_build_path is False: - cpp_build_path = _get_repo_path() + '/cpp/build/' + cpp_build_path = _get_repo_path() + '/cpp/build/_deps/' repo_cloned = get_submodule_dependency(name, cpp_build_path=cpp_build_path, @@ -144,7 +154,7 @@ def clone_repo_if_needed(name, cpp_build_path=None, repo_path = ( _get_repo_path() + '/python/_external_repositories/' + name + '/') else: - repo_path = os.path.join(cpp_build_path, name + '/src/' + name + '/') + repo_path = os.path.join(cpp_build_path, name + '-src/') return repo_path, repo_cloned @@ -184,7 +194,7 @@ def get_submodule_dependency(repo, repo_info = get_repo_cmake_info(repos, git_info_file) - if os.path.exists(cpp_build_path): + if os.path.exists(os.path.join(cpp_build_path, repos[0] + '-src/')): print("-- Third party modules found succesfully in the libcugraph++ " "build folder.") @@ -192,11 +202,11 @@ def get_submodule_dependency(repo, else: - warnings.warn("-- Third party repositories have not been found so they" - "will be cloned. 
To avoid this set the environment "
-                      "variable CUGRAPH_BUILD_PATH, containing the relative "
-                      "path of the root of the repository to the folder "
-                      "where libcugraph++ was built.")
+        print("-- Third party repositories have not been found so they "
+              "will be cloned. To avoid this set the environment "
+              "variable CUGRAPH_BUILD_PATH, containing the relative "
+              "path of the root of the repository to the folder "
+              "where libcugraph++ was built.")
 
         for repo in repos:
             clone_repo(repo, repo_info[repo][0], repo_info[repo][1])
 
@@ -266,20 +276,30 @@ def get_repo_cmake_info(names, file_path):
         specified by cmake.
 
     """
-    with open(file_path, encoding='utf-8') as f:
+    with open(file_path) as f:
         s = f.read()
 
     results = {}
 
     for name in names:
-        res = re.findall(r'ExternalProject_Add\(' + re.escape(name)
-                         + '\s.*GIT_REPOSITORY.*\s.*GIT_TAG.*',  # noqa: W605
-                         s)
-
-        res = re.sub(' +', ' ', res[0])
-        res = res.split(' ')
-        res = [res[2][:-1], res[4]]
-        results[name] = res
+        repo = re.findall(r'\s.*GIT_REPOSITORY.*', s)
+        repo = repo[-1].split()[-1]
+        fork = re.findall(r'\s.*FORK.*', s)
+        fork = fork[-1].split()[-1]
+        repo = repo.replace("${PKG_FORK}", fork)
+        tag = re.findall(r'\s.*PINNED_TAG.*', s)
+        tag = tag[-1].split()[-1]
+        results[name] = [repo, tag]
+        if tag == 'branch-${CUGRAPH_BRANCH_VERSION_raft}':
+            loc = _get_repo_path() + '/cpp/CMakeLists.txt'
+            with open(loc) as f:
+                cmakelists = f.read()
+            tag = re.findall(r'\s.*project\(CUGRAPH VERSION.*', cmakelists)
+            tag = tag[-1].split()[2].split('.')
+            tag = 'branch-{}.{}'.format(tag[0], tag[1])
+
+            results[name] = [repo, tag]
 
     return results
diff --git a/python/utils/ECG_Golden.ipynb b/python/utils/ECG_Golden.ipynb
deleted file mode 100644
index 0da04869d78..00000000000
--- a/python/utils/ECG_Golden.ipynb
+++ /dev/null
@@ -1,487 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This notebook was used to generate the golden data results for ECG. It requires that the python-igraph package be installed to run. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from scipy.io import mmread\n", - "import networkx as nx\n", - "#mmFile='/datasets/kron_g500-logn21/kron_g500-logn21.mtx'\n", - "mmFile='/datasets/golden_data/graphs/dblp.mtx'\n", - "#mmFile='/datasets/networks/karate.mtx'\n", - "#mmFile='/home/jwyles/code/mycugraph/datasets/dolphins.mtx'\n", - "#mmFile='/home/jwyles/code/mycugraph/datasets/netscience.mtx'\n", - "M = mmread(mmFile).asfptype()\n", - "import cugraph\n", - "import cudf\n", - "import numpy as np\n", - "rows = cudf.Series(M.row)\n", - "cols = cudf.Series(M.col)\n", - "values = cudf.Series(M.data)\n", - "G = cugraph.Graph()\n", - "G.add_edge_list(rows, cols, values)" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 326 ms, sys: 400 ms, total: 726 ms\n", - "Wall time: 796 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "parts = cugraph.ecg(G, .05, 16)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "49204" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "numParts = parts['partition'].max() + 1\n", - "numParts" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.850147008895874" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mod = cugraph.analyzeClustering_modularity(G, numParts, parts['partition'])\n", - "mod" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.7506256512679915" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "parts2, mod2 = cugraph.louvain(G)\n", - "mod2" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [], - "source": [ - "import igraph as ig\n", - "import numpy as np\n", - "\n", - "def community_ecg(self, weights=None, ens_size=16, min_weight=0.05):\n", - " W = [0]*self.ecount()\n", - " ## Ensemble of level-1 Louvain \n", - " for i in range(ens_size):\n", - " p = np.random.permutation(self.vcount()).tolist()\n", - " g = self.permute_vertices(p)\n", - " l = g.community_multilevel(weights=weights, return_levels=True)[0].membership\n", - " b = [l[p[x.tuple[0]]]==l[p[x.tuple[1]]] for x in self.es]\n", - " W = [W[i]+b[i] for i in range(len(W))]\n", - " W = [min_weight + (1-min_weight)*W[i]/ens_size for i in range(len(W))]\n", - " part = self.community_multilevel(weights=W)\n", - " ## Force min_weight outside 2-core\n", - " core = self.shell_index()\n", - " ecore = [min(core[x.tuple[0]],core[x.tuple[1]]) for x in self.es]\n", - " part.W = [W[i] if ecore[i]>1 else min_weight for i in range(len(ecore))]\n", - " return part\n", - "\n", - "ig.Graph.community_ecg = community_ecg" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [], - "source": [ - "Gi = ig.Graph.Read_Edgelist('./dblp2.txt', directed=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 3min 49s, sys: 1.67 s, total: 3min 51s\n", - "Wall time: 3min 50s\n" - ] - } - ], - 
"source": [ - "%%time\n", - "ec = Gi.community_ecg()" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "ecg = np.zeros(len(Gi.vs), dtype=np.int32)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([0, 0, 0, ..., 0, 0, 0], dtype=int32)" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ecg" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "for i in range(len(ec)):\n", - " for j in ec[i]:\n", - " ecg[j] = i" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([275, 275, 0, ..., 435, 435, 107], dtype=int32)" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ecg" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "ecg_col = cudf.Series(ecg)" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "numParts = ecg_col.max() + 1" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [], - "source": [ - "mod4 = cugraph.analyzeClustering_modularity(G, numParts, ecg_col)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.9279554486274719" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mod4" - ] - }, - { - "cell_type": "code", - "execution_count": 94, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "34" - ] - }, - "execution_count": 94, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "maxId = 0\n", - "for i in range(len(ec)):\n", - " for j in ec[i]:\n", - " if j > maxId:\n", - " maxId = j\n", - "maxId" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "156" - ] - }, - "execution_count": 79, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(Gi.es)" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "156" - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(M.row)" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "156" - ] - }, - "execution_count": 81, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "78 *2" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [], - "source": [ - "filename = \"dblp2.txt\"\n", - "f = open(filename, 'w')\n", - "for i in range(len(M.row)):\n", - " f.write(str(M.row[i]) + ' ' + str(M.col[i]) + '\\n')\n", - "f.close()" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "igraph.Edge(, 1, {})" - ] - }, - "execution_count": 82, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Gi.es[1]" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"igraph.Edge(, 1, {})" - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Gi.es.select()[1]" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 85, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Gi.es[0].source" - ] - }, - { - "cell_type": "code", - "execution_count": 88, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1" - ] - }, - "execution_count": 88, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Gi.es[0].target" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/thirdparty/LICENSES/LICENSE.boost b/thirdparty/LICENSES/LICENSE.boost new file mode 100644 index 00000000000..36b7cd93cdf --- /dev/null +++ b/thirdparty/LICENSES/LICENSE.boost @@ -0,0 +1,23 @@ +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/thirdparty/LICENSES/LICENSE.texas_state_university b/thirdparty/LICENSES/LICENSE.texas_state_university new file mode 100644 index 00000000000..7862557ac87 --- /dev/null +++ b/thirdparty/LICENSES/LICENSE.texas_state_university @@ -0,0 +1,24 @@ +Copyright (c) 2014-2020, Texas State University. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Texas State University nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL TEXAS STATE UNIVERSITY BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
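
Finally, since the new pin-parsing logic in setuputils.py (get_repo_cmake_info, earlier in this diff) is easy to misread, here is a self-contained, hedged sketch of what it extracts. The cmake fragment below is hypothetical; the real pins live in cpp/cmake/thirdparty/get_raft.cmake. The regexes mirror the ones in the function, including taking the last match so a literal FORK line wins over the ${PKG_FORK} placeholder.

import re

cmake_text = """
    GIT_REPOSITORY https://github.com/${PKG_FORK}/raft.git
    FORK           rapidsai
    PINNED_TAG     branch-21.08
"""

repo = re.findall(r'\s.*GIT_REPOSITORY.*', cmake_text)[-1].split()[-1]
fork = re.findall(r'\s.*FORK.*', cmake_text)[-1].split()[-1]
tag = re.findall(r'\s.*PINNED_TAG.*', cmake_text)[-1].split()[-1]

# Substitute the fork into the repository URL, as the function does.
print(repo.replace("${PKG_FORK}", fork), tag)
# -> https://github.com/rapidsai/raft.git branch-21.08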