Implement batch construction for strings columns #17035

Open: wants to merge 40 commits into base: branch-24.12

Changes from all commits (40 commits)
e252fb4
Complete `make_strings_column_batch`
ttnghia Oct 9, 2024
11d09ef
Update tests
ttnghia Oct 10, 2024
7e253f2
Update benchmark
ttnghia Oct 10, 2024
0f22d4b
Update docs
ttnghia Oct 10, 2024
326d73e
Rewrite `valid_if_n_kernel`
ttnghia Oct 10, 2024
d314669
Cleanup
ttnghia Oct 10, 2024
7453ef4
Fix tests
ttnghia Oct 10, 2024
ee5b7fd
Discard null mask if there is no nulls
ttnghia Oct 10, 2024
629516b
Change benchmark
ttnghia Oct 10, 2024
27d474d
Debugging performance
ttnghia Oct 10, 2024
6f7c6f3
Cleanup and fix `valid_if_batch_async`
ttnghia Oct 10, 2024
99107f6
Combine function
ttnghia Oct 10, 2024
42391fb
Implement `make_chars` using `cub::DeviceMemcpy::Batched`
ttnghia Oct 11, 2024
9e23b41
Move the implementation of `make_chars` to `strings_column_factories.…
ttnghia Oct 11, 2024
f8957f1
Revert unrelated changes
ttnghia Oct 11, 2024
239a816
Merge branch 'branch-24.12' into strings_batch_construction
ttnghia Oct 11, 2024
cfadc8d
Rewrite tests
ttnghia Oct 11, 2024
f13ab82
Fix null
ttnghia Oct 11, 2024
de1a716
Fix stream, and rename function
ttnghia Oct 11, 2024
22d6dac
Use `string_index_pair`
ttnghia Oct 11, 2024
0d95af5
Rewrite benchmark using range based for loop
ttnghia Oct 11, 2024
05155a3
Remove unused var
ttnghia Oct 11, 2024
7d4ed69
Disable test by default since it needs very much memory
ttnghia Oct 11, 2024
de5ebab
Change tests
ttnghia Oct 11, 2024
3f55a20
Add stream test
ttnghia Oct 11, 2024
c66c2d2
Change from using `make_host_vector_async` to `make_std_vector_async`
ttnghia Oct 11, 2024
03c3e1a
Using `DISABLED_` prefix for test
ttnghia Oct 11, 2024
60f7e3a
Add `CUDF_CUDA_TRY`
ttnghia Oct 11, 2024
6dccdc7
Change comments
ttnghia Oct 11, 2024
8e190ad
Fix spell
ttnghia Oct 11, 2024
c981e99
Test stream sync
ttnghia Oct 12, 2024
8dfa8dd
Revert "Test stream sync"
ttnghia Oct 12, 2024
4190cfd
Test disabling kvikio
ttnghia Oct 13, 2024
8fd05d1
Revert "Test disabling kvikio"
ttnghia Oct 14, 2024
a1a36c6
Merge branch 'branch-24.12' into strings_batch_construction
ttnghia Oct 14, 2024
b73240b
Move `make_chars_buffer` to `strings_children.cuh`
ttnghia Oct 14, 2024
d5dfa31
Merge branch 'branch-24.12' into strings_batch_construction
ttnghia Oct 14, 2024
3ec7b5a
Fix header with `device_memcpy.cuh`
ttnghia Oct 14, 2024
6d43f4c
Fix headers
ttnghia Oct 16, 2024
7764849
Merge branch 'branch-24.12' into strings_batch_construction
ttnghia Oct 16, 2024
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
@@ -384,6 +384,7 @@ ConfigureNVBench(
string/join_strings.cpp
string/lengths.cpp
string/like.cpp
string/make_strings_column.cu
string/replace_re.cpp
string/reverse.cpp
string/slice.cpp
100 changes: 100 additions & 0 deletions cpp/benchmarks/string/make_strings_column.cu
@@ -0,0 +1,100 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>

#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/pair.h>
#include <thrust/tabulate.h>

#include <nvbench/nvbench.cuh>

#include <vector>

namespace {

constexpr int min_row_width = 0;
constexpr int max_row_width = 50;

using string_index_pair = thrust::pair<char const*, cudf::size_type>;

template <bool batch_construction>
std::vector<std::unique_ptr<cudf::column>> make_strings_columns(
std::vector<cudf::device_span<string_index_pair const>> const& input,
rmm::cuda_stream_view stream)
{
if constexpr (batch_construction) {
return cudf::make_strings_column_batch(input, stream);
} else {
std::vector<std::unique_ptr<cudf::column>> output;
output.reserve(input.size());
for (auto const& column_input : input) {
output.emplace_back(cudf::make_strings_column(column_input, stream));
}
return output;
}
}

} // namespace

static void BM_make_strings_column_batch(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const batch_size = static_cast<cudf::size_type>(state.get_int64("batch_size"));
auto const has_nulls = true;

data_profile const table_profile =
data_profile_builder()
.distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width)
.null_probability(has_nulls ? std::optional<double>{0.1} : std::nullopt);
auto const data_table = create_random_table(
cycle_dtypes({cudf::type_id::STRING}, batch_size), row_count{num_rows}, table_profile);

auto const stream = cudf::get_default_stream();
auto input_data = std::vector<rmm::device_uvector<string_index_pair>>{};
auto input = std::vector<cudf::device_span<string_index_pair const>>{};
input_data.reserve(batch_size);
input.reserve(batch_size);
for (auto const& cv : data_table->view()) {
auto const d_data_ptr = cudf::column_device_view::create(cv, stream);
auto batch_input = rmm::device_uvector<string_index_pair>(cv.size(), stream);
thrust::tabulate(rmm::exec_policy(stream),
batch_input.begin(),
batch_input.end(),
[data_col = *d_data_ptr] __device__(auto const idx) {
if (data_col.is_null(idx)) { return string_index_pair{nullptr, 0}; }
auto const row = data_col.element<cudf::string_view>(idx);
return string_index_pair{row.data(), row.size_bytes()};
});
input_data.emplace_back(std::move(batch_input));
input.emplace_back(input_data.back());
}

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
[[maybe_unused]] auto const output = make_strings_columns<true>(input, stream);
});
}

NVBENCH_BENCH(BM_make_strings_column_batch)
.set_name("make_strings_column_batch")
.add_int64_axis("num_rows", {100'000, 500'000, 1'000'000, 2'000'000})
.add_int64_axis("batch_size", {10, 20, 50, 100});
20 changes: 20 additions & 0 deletions cpp/include/cudf/column/column_factories.hpp
@@ -378,6 +378,26 @@ std::unique_ptr<column> make_strings_column(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Construct a batch of STRING type columns given an array of device spans of pointer/size
* pairs.
*
* This function has input/output expectations similar to the `make_strings_column()` API that
* accepts only one device span of pointer/size pairs. The difference is that this function is
* designed to create many strings columns at once with minimal overhead from repeated kernel
* launches and stream synchronizations.
*
* @param input Array of device spans of pointer/size pairs, where each pointer is a device memory
* address or `nullptr` (indicating a null string), and size is string length (in bytes)
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used for memory allocation of the output columns
* @return Array of constructed strings columns
*/
std::vector<std::unique_ptr<column>> make_strings_column_batch(
std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> const& input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Construct a STRING type column given a device span of string_view.
*
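A minimal caller-side sketch of the new `make_strings_column_batch` API (illustrative only: the function `build_two_columns` and its arguments are hypothetical, and the pointer/size pairs are assumed to already reference valid device memory):

#include <cudf/column/column_factories.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/device_uvector.hpp>

#include <thrust/pair.h>

#include <memory>
#include <vector>

using string_index_pair = thrust::pair<char const*, cudf::size_type>;

// Build two strings columns from two pre-built arrays of pointer/size pairs with
// a single batched call instead of two separate make_strings_column() calls.
std::vector<std::unique_ptr<cudf::column>> build_two_columns(
  rmm::device_uvector<string_index_pair> const& pairs_a,
  rmm::device_uvector<string_index_pair> const& pairs_b)
{
  auto const stream = cudf::get_default_stream();
  std::vector<cudf::device_span<string_index_pair const>> input;
  input.emplace_back(pairs_a);  // construct a device_span view over each uvector
  input.emplace_back(pairs_b);
  // One call constructs all output columns, amortizing kernel launches and
  // stream synchronizations across the whole batch.
  return cudf::make_strings_column_batch(input, stream);
}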
58 changes: 58 additions & 0 deletions cpp/include/cudf/strings/detail/strings_children.cuh
@@ -17,6 +17,7 @@

#include <cudf/column/column.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
@@ -29,6 +30,8 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <cub/device/device_memcpy.cuh>
#include <cuda/functional>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

@@ -38,6 +41,61 @@ namespace cudf {
namespace strings {
namespace detail {

/**
* @brief Gather characters to create a strings column using the given string-index pair iterator
*
* @tparam IndexPairIterator iterator over type `pair<char const*,size_type>` values
*
* @param offsets The offsets for the output strings column
* @param chars_size The size (in bytes) of the chars data
* @param begin Iterator to the first string-index pair
* @param strings_count The number of strings
* @param stream CUDA stream used for device memory operations
* @param mr Device memory resource used to allocate the returned column's device memory
* @return An array of chars gathered from the input string-index pair iterator
*/
template <typename IndexPairIterator>
rmm::device_uvector<char> make_chars_buffer(column_view const& offsets,
int64_t chars_size,
IndexPairIterator begin,
size_type strings_count,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
auto chars_data = rmm::device_uvector<char>(chars_size, stream, mr);
auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets);

auto const src_ptrs = cudf::detail::make_counting_transform_iterator(
0u, cuda::proclaim_return_type<void*>([begin] __device__(uint32_t idx) {
// Due to a bug in cub (https://github.com/NVIDIA/cccl/issues/586),
// we have to use `const_cast` to remove `const` qualifier from the source pointer.
// This should be fine as long as we only read but not write anything to the source.
return reinterpret_cast<void*>(const_cast<char*>(begin[idx].first));
}));
auto const src_sizes = cudf::detail::make_counting_transform_iterator(
0u, cuda::proclaim_return_type<size_type>([begin] __device__(uint32_t idx) {
return begin[idx].second;
}));
auto const dst_ptrs = cudf::detail::make_counting_transform_iterator(
0u,
cuda::proclaim_return_type<char*>([offsets = d_offsets, output = chars_data.data()] __device__(
uint32_t idx) { return output + offsets[idx]; }));

size_t temp_storage_bytes = 0;
CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched(
nullptr, temp_storage_bytes, src_ptrs, dst_ptrs, src_sizes, strings_count, stream.value()));
rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);
CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched(d_temp_storage.data(),
temp_storage_bytes,
src_ptrs,
dst_ptrs,
src_sizes,
strings_count,
stream.value()));

return chars_data;
}

/**
* @brief Create an offsets column to be a child of a compound column
*
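For context, a hedged sketch of how `make_chars_buffer` is typically driven from a string-index-pair iterator, mirroring the factory change further down in this PR (here `begin`, `strings_count`, `stream`, and `mr` are assumed to be in scope, and `string_index_pair` is `thrust::pair<char const*, cudf::size_type>`):

// Per-row byte sizes: null rows (nullptr data pointer) contribute zero bytes.
auto const sizes_itr = thrust::make_transform_iterator(
  begin, cuda::proclaim_return_type<cudf::size_type>([] __device__(string_index_pair ip) {
    return ip.first != nullptr ? ip.second : cudf::size_type{0};
  }));
// Scan the sizes into the offsets child column and get the total chars size in bytes.
auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
  sizes_itr, sizes_itr + strings_count, stream, mr);
// Gather all character data into one buffer with a single batched memcpy.
auto chars_data = cudf::strings::detail::make_chars_buffer(
  offsets_column->view(), bytes, begin, strings_count, stream, mr);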
46 changes: 2 additions & 44 deletions cpp/include/cudf/strings/detail/strings_column_factories.cuh
@@ -49,16 +49,6 @@ namespace detail {
*/
using string_index_pair = thrust::pair<char const*, size_type>;

/**
* @brief Average string byte-length threshold for deciding character-level
* vs. row-level parallel algorithm.
*
* This value was determined by running the factory_benchmark against different
* string lengths and observing the point where the performance is faster for
* long strings.
*/
constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64;

/**
* @brief Create a strings-type column from iterators of pointer/size pairs
*
@@ -88,8 +78,6 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer);
auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
auto const d_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());

// create null mask
auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; };
@@ -99,38 +87,8 @@
(null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr};

// build chars column
auto chars_data = [d_offsets, bytes = bytes, begin, strings_count, null_count, stream, mr] {
auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1);
// use a character-parallel kernel for long string lengths
if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) {
auto const str_begin = thrust::make_transform_iterator(
begin, cuda::proclaim_return_type<string_view>([] __device__(auto ip) {
return string_view{ip.first, ip.second};
}));

return gather_chars(str_begin,
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(strings_count),
d_offsets,
bytes,
stream,
mr);
} else {
// this approach is 2-3x faster for a large number of smaller string lengths
auto chars_data = rmm::device_uvector<char>(bytes, stream, mr);
auto d_chars = chars_data.data();
auto copy_chars = [d_chars] __device__(auto item) {
string_index_pair const str = thrust::get<0>(item);
int64_t const offset = thrust::get<1>(item);
if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second);
};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_zip_iterator(thrust::make_tuple(begin, d_offsets)),
strings_count,
copy_chars);
return chars_data;
}
}();
auto chars_data =
make_chars_buffer(offsets_column->view(), bytes, begin, strings_count, stream, mr);
Contributor:

I wonder if this could replace gather_chars in

auto out_chars_data = gather_chars(

Out of scope for this PR I think. I can make a note to look into this in a follow on PR.


return make_strings_column(strings_count,
std::move(offsets_column),