Add libcudf example with large strings (rapidsai#15983)
Creating an example that shows reading large strings columns. This uses the 1 billion row challenge input data and provides three examples of loading this data: 
- `brc` uses the CSV reader to load the input file in one call and aggregates the results using `groupby`
- `brc_chunks` uses the CSV reader to load the input file in chunks, aggregates each chunk, and computes the results
- `brc_pipeline` is the same as `brc_chunks`, but the input chunks are processed in separate threads/streams.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Gregory Kimball (https://github.com/GregoryKimball)
  - Bradley Dice (https://github.com/bdice)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)

URL: rapidsai#15983
davidwendt authored Sep 5, 2024
1 parent 0e86f62 commit 715677e
Showing 9 changed files with 674 additions and 0 deletions.
34 changes: 34 additions & 0 deletions cpp/examples/billion_rows/CMakeLists.txt
@@ -0,0 +1,34 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

cmake_minimum_required(VERSION 3.26.4)

include(../set_cuda_architecture.cmake)

# initialize cuda architecture
rapids_cuda_init_architectures(billion_rows)
rapids_cuda_set_architectures(RAPIDS)

project(
billion_rows
VERSION 0.0.1
LANGUAGES CXX CUDA
)

include(../fetch_dependencies.cmake)

list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)

add_library(groupby_results OBJECT groupby_results.cpp)
target_link_libraries(groupby_results PRIVATE cudf::cudf)

add_executable(brc brc.cpp)
target_link_libraries(brc PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:groupby_results>)
install(TARGETS brc DESTINATION bin/examples/libcudf)

add_executable(brc_chunks brc_chunks.cpp)
target_link_libraries(brc_chunks PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:groupby_results>)
install(TARGETS brc_chunks DESTINATION bin/examples/libcudf)

add_executable(brc_pipeline brc_pipeline.cpp)
target_link_libraries(brc_pipeline PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:groupby_results>)
install(TARGETS brc_pipeline DESTINATION bin/examples/libcudf)
44 changes: 44 additions & 0 deletions cpp/examples/billion_rows/README.md
@@ -0,0 +1,44 @@
# libcudf C++ example for the 1 billion row challenge

This C++ example demonstrates using libcudf APIs to read and process
a table with 1 billion rows. The 1 billion row challenge is described here:
https://github.com/gunnarmorling/1brc

The examples load the 1 billion row text file using the CSV reader.
The file contains around 400 unique city names (string type) along with
random temperature values (float type).
Once loaded, the examples perform groupby aggregations to find the
minimum, maximum, and average temperature for each city.

There are three examples included:
1. `brc.cpp`
Loads the file in one call to the CSV reader.
This generally requires a large amount of available GPU memory.
2. `brc_chunks.cpp`
Loads and processes the file in chunks.
The number of chunks to use is a parameter to the executable.
3. `brc_pipeline.cpp`
Loads and processes the file in chunks with separate threads/streams.
The number of chunks and number of threads to use are parameters to the executable.

An input file can be generated using the instructions from
https://github.com/gunnarmorling/1brc.
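
Each line of the input file holds a city name and a temperature reading
separated by a semicolon (matching the `';'` delimiter configured in the
examples). A few illustrative lines:

```
Hamburg;12.0
Bulawayo;8.9
Palembang;38.8
Hamburg;34.2
```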

## Compile and execute

```bash
# Configure project
cmake -S . -B build/
# Build
cmake --build build/ --parallel $PARALLEL_LEVEL
# Execute
build/brc input.txt
# Execute in chunked mode with 25 chunks (default)
build/brc_chunks input.txt 25
# Execute in pipeline mode with 25 chunks and 2 threads (defaults)
build/brc_pipeline input.txt 25 2
```

If your machine does not have a pre-built libcudf binary, expect the
first build to take some time, as it will build libcudf on the host machine.
The build can be sped up by setting an appropriate `PARALLEL_LEVEL` value.
94 changes: 94 additions & 0 deletions cpp/examples/billion_rows/brc.cpp
@@ -0,0 +1,94 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "common.hpp"
#include "groupby_results.hpp"

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/io/csv.hpp>
#include <cudf/io/types.hpp>
#include <cudf/sorting.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/mr/device/statistics_resource_adaptor.hpp>

#include <chrono>
#include <iostream>
#include <memory>
#include <string>

using elapsed_t = std::chrono::duration<double>;

int main(int argc, char const** argv)
{
if (argc < 2) {
std::cout << "required parameter: input-file-path\n";
return 1;
}

auto const input_file = std::string{argv[1]};
std::cout << "Input: " << input_file << std::endl;

auto const mr_name = std::string("pool");
auto resource = create_memory_resource(mr_name);
auto stats_mr =
rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>(resource.get());
rmm::mr::set_current_device_resource(&stats_mr);
auto stream = cudf::get_default_stream();

auto start = std::chrono::steady_clock::now();

auto const csv_result = [input_file, stream] {
cudf::io::csv_reader_options in_opts =
cudf::io::csv_reader_options::builder(cudf::io::source_info{input_file})
.header(-1)
.delimiter(';')
.doublequote(false)
.dtypes(std::vector<cudf::data_type>{cudf::data_type{cudf::type_id::STRING},
cudf::data_type{cudf::type_id::FLOAT32}})
.na_filter(false);
return cudf::io::read_csv(in_opts, stream).tbl;
}();
elapsed_t elapsed = std::chrono::steady_clock::now() - start;
std::cout << "File load time: " << elapsed.count() << " seconds\n";
auto const csv_table = csv_result->view();
std::cout << "Input rows: " << csv_table.num_rows() << std::endl;

auto const cities = csv_table.column(0);
auto const temps = csv_table.column(1);

std::vector<std::unique_ptr<cudf::groupby_aggregation>> aggregations;
aggregations.emplace_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());
aggregations.emplace_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
aggregations.emplace_back(cudf::make_mean_aggregation<cudf::groupby_aggregation>());

auto result = compute_results(cities, temps, std::move(aggregations), stream);

// The other two examples sort their sub-aggregate results, so enabling the
// following line may make this example's performance more comparable with them.
//
// result = cudf::sort_by_key(result->view(), result->view().select({0}), {}, {}, stream);

stream.synchronize();

elapsed = std::chrono::steady_clock::now() - start;
std::cout << "Number of keys: " << result->num_rows() << std::endl;
std::cout << "Process time: " << elapsed.count() << " seconds\n";
std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n";

return 0;
}
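
`compute_results` is declared in `groupby_results.hpp` and implemented in
`groupby_results.cpp`, neither of which rendered in this view. A minimal
sketch of what it might look like, assuming it simply wraps `cudf::groupby`
over the city keys (the commit's actual implementation may differ):

```cpp
#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <memory>
#include <utility>
#include <vector>

std::unique_ptr<cudf::table> compute_results(
  cudf::column_view const& cities,
  cudf::column_view const& temps,
  std::vector<std::unique_ptr<cudf::groupby_aggregation>>&& aggregations,
  rmm::cuda_stream_view stream)
{
  // Group rows by city name
  cudf::groupby::groupby grouper(cudf::table_view({cities}));

  // Apply all requested aggregations to the temperature column
  std::vector<cudf::groupby::aggregation_request> requests;
  requests.emplace_back();
  requests.back().values       = temps;
  requests.back().aggregations = std::move(aggregations);

  auto [keys, results] = grouper.aggregate(requests, stream);

  // Assemble the key column and the aggregate columns into one table
  auto columns = keys->release();
  for (auto& agg : results.front().results) {
    columns.emplace_back(std::move(agg));
  }
  return std::make_unique<cudf::table>(std::move(columns));
}
```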
116 changes: 116 additions & 0 deletions cpp/examples/billion_rows/brc_chunks.cpp
@@ -0,0 +1,116 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "common.hpp"
#include "groupby_results.hpp"

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/io/csv.hpp>
#include <cudf/sorting.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/mr/device/statistics_resource_adaptor.hpp>

#include <chrono>
#include <filesystem>
#include <iostream>
#include <memory>
#include <string>

using elapsed_t = std::chrono::duration<double>;

std::unique_ptr<cudf::table> load_chunk(std::string const& input_file,
std::size_t start,
std::size_t size,
rmm::cuda_stream_view stream)
{
cudf::io::csv_reader_options in_opts =
cudf::io::csv_reader_options::builder(cudf::io::source_info{input_file})
.header(-1)
.delimiter(';')
.doublequote(false)
.byte_range_offset(start)
.byte_range_size(size)
.dtypes(std::vector<cudf::data_type>{cudf::data_type{cudf::type_id::STRING},
cudf::data_type{cudf::type_id::FLOAT32}})
.na_filter(false);
return cudf::io::read_csv(in_opts, stream).tbl;
}

int main(int argc, char const** argv)
{
if (argc < 2) {
std::cout << "required parameter: input-file-path\n";
std::cout << "optional parameter: chunk-count\n";
return 1;
}

auto const input_file = std::string{argv[1]};
auto const divider = (argc < 3) ? 25 : std::stoi(std::string(argv[2]));

std::cout << "Input: " << input_file << std::endl;
std::cout << "Chunks: " << divider << std::endl;

auto const mr_name = std::string("pool");
auto resource = create_memory_resource(mr_name);
auto stats_mr =
rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>(resource.get());
rmm::mr::set_current_device_resource(&stats_mr);
auto stream = cudf::get_default_stream();

std::filesystem::path p = input_file;
auto const file_size = std::filesystem::file_size(p);

auto start = std::chrono::steady_clock::now();

std::vector<std::unique_ptr<cudf::table>> agg_data;
std::size_t chunk_size = file_size / divider + ((file_size % divider) != 0);
std::size_t start_pos = 0;
cudf::size_type total_rows = 0;
do {
auto const input_table = load_chunk(input_file, start_pos, chunk_size, stream);
auto const read_rows = input_table->num_rows();
if (read_rows == 0) break;

auto const cities = input_table->view().column(0);
auto const temps = input_table->view().column(1);

std::vector<std::unique_ptr<cudf::groupby_aggregation>> aggregations;
aggregations.emplace_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());
aggregations.emplace_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
aggregations.emplace_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
aggregations.emplace_back(cudf::make_count_aggregation<cudf::groupby_aggregation>());
auto result = compute_results(cities, temps, std::move(aggregations), stream);

agg_data.emplace_back(
cudf::sort_by_key(result->view(), result->view().select({0}), {}, {}, stream));
start_pos += chunk_size;
chunk_size = std::min(chunk_size, file_size - start_pos);
total_rows += read_rows;
} while (start_pos < file_size && chunk_size > 0);

// now aggregate the aggregate results
auto results = compute_final_aggregates(agg_data, stream);
stream.synchronize();

elapsed_t elapsed = std::chrono::steady_clock::now() - start;
std::cout << "Number of keys: " << results->num_rows() << std::endl;
std::cout << "Process time: " << elapsed.count() << " seconds\n";
std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n";

return 0;
}
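
`compute_final_aggregates` merges the per-chunk sub-aggregates into the final
answer. A hypothetical sketch, assuming each per-chunk table holds
`[city, min, max, sum, count]` columns and the final mean is total sum over
total count (the real `groupby_results.cpp` is not shown here):

```cpp
#include <cudf/binaryop.hpp>
#include <cudf/concatenate.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table.hpp>

#include <memory>
#include <vector>

std::unique_ptr<cudf::table> compute_final_aggregates(
  std::vector<std::unique_ptr<cudf::table>> const& agg_data,
  rmm::cuda_stream_view stream)
{
  // Concatenate the per-chunk [city, min, max, sum, count] tables
  std::vector<cudf::table_view> views;
  for (auto const& tbl : agg_data) { views.push_back(tbl->view()); }
  auto const all = cudf::concatenate(views, stream);
  auto const tv  = all->view();

  // Re-group by city: min of mins, max of maxes, sum of sums, sum of counts
  cudf::groupby::groupby grouper(cudf::table_view({tv.column(0)}));
  std::vector<cudf::groupby::aggregation_request> requests(4);
  requests[0].values = tv.column(1);
  requests[0].aggregations.emplace_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());
  requests[1].values = tv.column(2);
  requests[1].aggregations.emplace_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
  requests[2].values = tv.column(3);
  requests[2].aggregations.emplace_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
  requests[3].values = tv.column(4);
  requests[3].aggregations.emplace_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
  auto [keys, results] = grouper.aggregate(requests, stream);

  // mean = total sum / total count
  auto mean = cudf::binary_operation(results[2].results.front()->view(),
                                     results[3].results.front()->view(),
                                     cudf::binary_operator::DIV,
                                     cudf::data_type{cudf::type_id::FLOAT32},
                                     stream);

  // Final table: [city, min, max, mean]
  auto columns = keys->release();
  columns.emplace_back(std::move(results[0].results.front()));
  columns.emplace_back(std::move(results[1].results.front()));
  columns.emplace_back(std::move(mean));
  return std::make_unique<cudf::table>(std::move(columns));
}
```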
(The remaining changed files in this commit, including `brc_pipeline.cpp`, `common.hpp`, and the `groupby_results.cpp`/`groupby_results.hpp` sources, did not load in this view.)
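
The commit message describes `brc_pipeline` as `brc_chunks` with the input
chunks processed in separate threads/streams. A rough, hypothetical sketch of
that pattern, reusing the `load_chunk` and `compute_results` helpers shown
above (the function name and chunk-distribution scheme here are assumptions,
not the commit's actual code):

```cpp
#include <cudf/sorting.hpp>

#include <rmm/cuda_stream_pool.hpp>

#include <algorithm>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

// Distribute byte-range chunks across threads, each with its own stream.
void process_chunks(std::string const& input_file,
                    std::size_t file_size,
                    std::size_t chunk_size,
                    std::size_t num_threads,
                    std::vector<std::unique_ptr<cudf::table>>& agg_data)
{
  rmm::cuda_stream_pool stream_pool(num_threads);
  std::mutex agg_mutex;
  std::vector<std::thread> threads;

  for (std::size_t t = 0; t < num_threads; ++t) {
    threads.emplace_back([&, t] {
      auto stream = stream_pool.get_stream(t);
      // Thread t takes chunks t, t + num_threads, t + 2 * num_threads, ...
      for (auto start = t * chunk_size; start < file_size;
           start += chunk_size * num_threads) {
        auto const size  = std::min(chunk_size, file_size - start);
        auto input_table = load_chunk(input_file, start, size, stream);
        if (input_table->num_rows() == 0) { continue; }

        std::vector<std::unique_ptr<cudf::groupby_aggregation>> aggs;
        aggs.emplace_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());
        aggs.emplace_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
        aggs.emplace_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
        aggs.emplace_back(cudf::make_count_aggregation<cudf::groupby_aggregation>());
        auto result = compute_results(input_table->view().column(0),
                                      input_table->view().column(1),
                                      std::move(aggs),
                                      stream);
        auto sorted =
          cudf::sort_by_key(result->view(), result->view().select({0}), {}, {}, stream);
        stream.synchronize();

        // Collect the per-chunk sub-aggregates for compute_final_aggregates
        std::lock_guard<std::mutex> lock(agg_mutex);
        agg_data.emplace_back(std::move(sorted));
      }
    });
  }
  for (auto& thread : threads) { thread.join(); }
}
```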
