diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 1ad516d5c..f38400f90 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -73,12 +73,14 @@ struct window_probing_results { * @tparam KeyEqual Binary callable type used to compare two keys for equality * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for options) * @tparam StorageRef Storage ref type + * @tparam AllowsDuplicates Flag indicating whether duplicate keys are allowed or not */ template + typename StorageRef, + bool AllowsDuplicates> class open_addressing_ref_impl { static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); @@ -94,6 +96,9 @@ class open_addressing_ref_impl { /// Determines if the container is a key/value or key-only store static constexpr auto has_payload = not std::is_same_v; + /// Flag indicating whether duplicate keys are allowed or not + static constexpr auto allows_duplicates = AllowsDuplicates; + // TODO: how to re-enable this check? // static_assert(is_window_extent_v, // "Extent is not a valid cuco::window_extent"); @@ -360,8 +365,10 @@ class open_addressing_ref_impl { for (auto& slot_content : window_slots) { auto const eq_res = this->predicate_(this->extract_key(slot_content), key); - // If the key is already in the container, return false - if (eq_res == detail::equal_result::EQUAL) { return false; } + if constexpr (not allows_duplicates) { + // If the key is already in the container, return false + if (eq_res == detail::equal_result::EQUAL) { return false; } + } if (eq_res == detail::equal_result::EMPTY or cuco::detail::bitwise_compare(this->extract_key(slot_content), this->erased_key_sentinel())) { @@ -369,9 +376,15 @@ class open_addressing_ref_impl { switch (attempt_insert((storage_ref_.data() + *probing_iter)->data() + intra_window_index, slot_content, val)) { + case insert_result::DUPLICATE: { + if constexpr (allows_duplicates) { + [[fallthrough]]; + } else { + return false; + } + } case insert_result::CONTINUE: continue; case insert_result::SUCCESS: return true; - case insert_result::DUPLICATE: return false; } } } @@ -405,8 +418,13 @@ class open_addressing_ref_impl { switch (this->predicate_(this->extract_key(window_slots[i]), key)) { case detail::equal_result::EMPTY: return window_probing_results{detail::equal_result::EMPTY, i}; - case detail::equal_result::EQUAL: - return window_probing_results{detail::equal_result::EQUAL, i}; + case detail::equal_result::EQUAL: { + if constexpr (allows_duplicates) { + continue; + } else { + return window_probing_results{detail::equal_result::EQUAL, i}; + } + } default: { if (cuco::detail::bitwise_compare(this->extract_key(window_slots[i]), this->erased_key_sentinel())) { @@ -421,8 +439,10 @@ class open_addressing_ref_impl { return window_probing_results{detail::equal_result::UNEQUAL, -1}; }(); - // If the key is already in the container, return false - if (group.any(state == detail::equal_result::EQUAL)) { return false; } + if constexpr (not allows_duplicates) { + // If the key is already in the container, return false + if (group.any(state == detail::equal_result::EQUAL)) { return false; } + } auto const group_contains_available = group.ballot(state == detail::equal_result::EMPTY or state == detail::equal_result::ERASED); @@ -437,7 +457,13 @@ class open_addressing_ref_impl { switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: return true; - case insert_result::DUPLICATE: return false; + case insert_result::DUPLICATE: { + if constexpr (allows_duplicates) { + [[fallthrough]]; + } else { + return false; + } + } default: continue; } } else { diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl new file mode 100644 index 000000000..174f9bcbe --- /dev/null +++ b/include/cuco/detail/static_multiset/static_multiset.inl @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include + +namespace cuco { + +template +constexpr static_multiset:: + static_multiset(Extent capacity, + empty_key empty_key_sentinel, + KeyEqual const& pred, + ProbingScheme const& probing_scheme, + cuda_thread_scope, + Storage, + Allocator const& alloc, + cuda_stream_ref stream) + : impl_{std::make_unique( + capacity, empty_key_sentinel, pred, probing_scheme, alloc, stream)} +{ +} + +template +constexpr static_multiset:: + static_multiset(Extent n, + double desired_load_factor, + empty_key empty_key_sentinel, + KeyEqual const& pred, + ProbingScheme const& probing_scheme, + cuda_thread_scope, + Storage, + Allocator const& alloc, + cuda_stream_ref stream) + : impl_{std::make_unique( + n, desired_load_factor, empty_key_sentinel, pred, probing_scheme, alloc, stream)} +{ +} + +template +constexpr static_multiset:: + static_multiset(Extent capacity, + empty_key empty_key_sentinel, + erased_key erased_key_sentinel, + KeyEqual const& pred, + ProbingScheme const& probing_scheme, + cuda_thread_scope, + Storage, + Allocator const& alloc, + cuda_stream_ref stream) + : impl_{std::make_unique( + capacity, empty_key_sentinel, erased_key_sentinel, pred, probing_scheme, alloc, stream)} +{ +} + +template +void static_multiset::clear( + cuda_stream_ref stream) noexcept +{ + impl_->clear(stream); +} + +template +void static_multiset::clear_async( + cuda_stream_ref stream) noexcept +{ + impl_->clear_async(stream); +} + +template +template +void static_multiset::insert( + InputIt first, InputIt last, cuda_stream_ref stream) +{ + this->insert_async(first, last, stream); + stream.synchronize(); +} + +template +template +void static_multiset::insert_async( + InputIt first, InputIt last, cuda_stream_ref stream) noexcept +{ + impl_->insert_async(first, last, ref(op::insert), stream); +} + +template +template +void static_multiset::insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) +{ + this->insert_if_async(first, last, stencil, pred, stream); + stream.synchronize(); +} + +template +template +void static_multiset:: + insert_if_async( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) noexcept +{ + impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream); +} + +template +static_multiset::size_type +static_multiset::size( + cuda_stream_ref stream) const noexcept +{ + return impl_->size(stream); +} + +template +constexpr auto +static_multiset::capacity() + const noexcept +{ + return impl_->capacity(); +} + +template +constexpr static_multiset::key_type +static_multiset:: + empty_key_sentinel() const noexcept +{ + return impl_->empty_key_sentinel(); +} + +template +constexpr static_multiset::key_type +static_multiset:: + erased_key_sentinel() const noexcept +{ + return impl_->erased_key_sentinel(); +} + +template +template +auto static_multiset::ref( + Operators...) const noexcept +{ + static_assert(sizeof...(Operators), "No operators specified"); + return cuco::detail::bitwise_compare(this->empty_key_sentinel(), this->erased_key_sentinel()) + ? ref_type{cuco::empty_key(this->empty_key_sentinel()), + impl_->key_eq(), + impl_->probing_scheme(), + cuda_thread_scope{}, + impl_->storage_ref()} + : ref_type{cuco::empty_key(this->empty_key_sentinel()), + cuco::erased_key(this->erased_key_sentinel()), + impl_->key_eq(), + impl_->probing_scheme(), + cuda_thread_scope{}, + impl_->storage_ref()}; +} +} // namespace cuco diff --git a/include/cuco/detail/static_multiset/static_multiset_ref.inl b/include/cuco/detail/static_multiset/static_multiset_ref.inl new file mode 100644 index 000000000..3c44735a4 --- /dev/null +++ b/include/cuco/detail/static_multiset/static_multiset_ref.inl @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include + +namespace cuco { + +template +__host__ __device__ constexpr static_multiset_ref< + Key, + Scope, + KeyEqual, + ProbingScheme, + StorageRef, + Operators...>::static_multiset_ref(cuco::empty_key empty_key_sentinel, + KeyEqual const& predicate, + ProbingScheme const& probing_scheme, + cuda_thread_scope, + StorageRef storage_ref) noexcept + : impl_{empty_key_sentinel, predicate, probing_scheme, storage_ref} +{ +} + +template +__host__ __device__ constexpr static_multiset_ref< + Key, + Scope, + KeyEqual, + ProbingScheme, + StorageRef, + Operators...>::static_multiset_ref(cuco::empty_key empty_key_sentinel, + cuco::erased_key erased_key_sentinel, + KeyEqual const& predicate, + ProbingScheme const& probing_scheme, + cuda_thread_scope, + StorageRef storage_ref) noexcept + : impl_{empty_key_sentinel, erased_key_sentinel, predicate, probing_scheme, storage_ref} +{ +} + +template +template +__host__ __device__ constexpr static_multiset_ref:: + static_multiset_ref( + static_multiset_ref&& + other) noexcept + : impl_{std::move(other.impl_)} +{ +} + +template +__host__ __device__ constexpr static_multiset_ref::key_equal +static_multiset_ref::key_eq() + const noexcept +{ + return this->impl_.key_eq(); +} + +template +__host__ __device__ constexpr auto +static_multiset_ref::capacity() + const noexcept +{ + return impl_.capacity(); +} + +template +__host__ __device__ constexpr static_multiset_ref::extent_type +static_multiset_ref::window_extent() + const noexcept +{ + return impl_.window_extent(); +} + +template +__host__ __device__ constexpr Key +static_multiset_ref:: + empty_key_sentinel() const noexcept +{ + return impl_.empty_key_sentinel(); +} + +template +__host__ __device__ constexpr Key +static_multiset_ref:: + erased_key_sentinel() const noexcept +{ + return impl_.erased_key_sentinel(); +} + +template +__host__ __device__ constexpr static_multiset_ref::const_iterator +static_multiset_ref::end() + const noexcept +{ + return this->impl_.end(); +} + +template +__host__ __device__ constexpr static_multiset_ref::iterator +static_multiset_ref::end() noexcept +{ + return this->impl_.end(); +} + +template +template +auto static_multiset_ref::with( + NewOperators...) && noexcept +{ + return static_multiset_ref{ + std::move(*this)}; +} + +namespace detail { + +template +class operator_impl< + op::insert_tag, + static_multiset_ref> { + using base_type = static_multiset_ref; + using ref_type = + static_multiset_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Inserts an element. + * + * @tparam Value Input type which is convertible to 'value_type' + * + * @param value The element to insert + * + * @return True if the given element is successfully inserted + */ + template + __device__ bool insert(Value const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + return ref_.impl_.insert(value); + } + + /** + * @brief Inserts an element. + * + * @tparam Value Input type which is convertible to 'value_type' + * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * + * @return True if the given element is successfully inserted + */ + template + __device__ bool insert(cooperative_groups::thread_block_tile const& group, + Value const& value) noexcept + { + auto& ref_ = static_cast(*this); + return ref_.impl_.insert(group, value); + } +}; + +} // namespace detail +} // namespace cuco diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh index 48b53ecf5..953507a6a 100644 --- a/include/cuco/static_map_ref.cuh +++ b/include/cuco/static_map_ref.cuh @@ -67,8 +67,12 @@ class static_map_ref : public detail::operator_impl< Operators, static_map_ref>... { - using impl_type = - detail::open_addressing_ref_impl; + /// Flag indicating whether duplicate keys are allowed or not + static constexpr auto allows_duplicates = false; + + /// Implementation type + using impl_type = detail:: + open_addressing_ref_impl; static_assert(sizeof(T) <= 8, "Container does not support payload types larger than 8 bytes."); diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh new file mode 100644 index 000000000..b4a684bcc --- /dev/null +++ b/include/cuco/static_multiset.cuh @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace cuco { +/** + * @brief A GPU-accelerated, unordered, associative container of possibly non-unique objects + * + * The `static_multiset` supports two types of operations: + * - Host-side "bulk" operations + * - Device-side "singular" operations + * + * The host-side bulk operations include `insert`, `contains`, etc. These APIs should be used when + * there are a large number of keys to modify or lookup. For example, given a range of keys + * specified by device-accessible iterators, the bulk `insert` function will insert all keys into + * the set. + * + * The singular device-side operations allow individual threads (or cooperative groups) to perform + * independent modify or lookup operations from device code. These operations are accessed through + * non-owning, trivially copyable reference types (or "ref"). User can combine any arbitrary + * operators (see options in `include/cuco/operator.hpp`) when creating the ref. Concurrent modify + * and lookup will be supported if both kinds of operators are specified during the ref + * construction. + * + * @note Allows constant time concurrent modify or lookup operations from threads in device code. + * @note cuCollections data structures always place the slot keys on the left-hand side when + * invoking the key comparison predicate, i.e., `pred(slot_key, query_key)`. Order-sensitive + * `KeyEqual` should be used with caution. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` + * @tparam Extent Data structure size type + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for choices) + * @tparam Allocator Type of allocator used for device storage + * @tparam Storage Slot window storage type + */ +template , + cuda::thread_scope Scope = cuda::thread_scope_device, + class KeyEqual = thrust::equal_to, + class ProbingScheme = cuco::double_hashing<4, // CG size + cuco::default_hash_function>, + class Allocator = cuco::cuda_allocator, + class Storage = cuco::storage<1>> +class static_multiset { + using impl_type = detail:: + open_addressing_impl; + + public: + static constexpr auto cg_size = impl_type::cg_size; ///< CG size used for probing + static constexpr auto window_size = impl_type::window_size; ///< Window size used for probing + static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope + + using key_type = typename impl_type::key_type; ///< Key type + using value_type = typename impl_type::value_type; ///< Key type + using extent_type = typename impl_type::extent_type; ///< Extent type + using size_type = typename impl_type::size_type; ///< Size type + using key_equal = typename impl_type::key_equal; ///< Key equality comparator type + using allocator_type = typename impl_type::allocator_type; ///< Allocator type + /// Non-owning window storage ref type + using storage_ref_type = typename impl_type::storage_ref_type; + using probing_scheme_type = typename impl_type::probing_scheme_type; ///< Probing scheme type + + template + using ref_type = cuco::static_multiset_ref; ///< Non-owning container ref type + + static_multiset(static_multiset const&) = delete; + static_multiset& operator=(static_multiset const&) = delete; + + static_multiset(static_multiset&&) = default; ///< Move constructor + + /** + * @brief Replaces the contents of the container with another container. + * + * @return Reference of the current multiset object + */ + static_multiset& operator=(static_multiset&&) = default; + ~static_multiset() = default; + + /** + * @brief Constructs a statically-sized multiset with the specified initial capacity, sentinel + * values and CUDA stream + * + * The actual multiset capacity depends on the given `capacity`, the probing scheme, CG size, and + * the window size and it is computed via the `make_window_extent` factory. Insert operations will + * not automatically grow the set. Attempting to insert more unique keys than the capacity of the + * multiset results in undefined behavior. + * + * @note Any `*_sentinel`s are reserved and behavior is undefined when attempting to insert + * this sentinel value. + * @note This constructor doesn't synchronize the given stream. + * + * @param capacity The requested lower-bound multiset size + * @param empty_key_sentinel The reserved key value for empty slots + * @param pred Key equality binary predicate + * @param probing_scheme Probing scheme + * @param scope The scope in which operations will be performed + * @param storage Kind of storage to use + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the set + */ + constexpr static_multiset(Extent capacity, + empty_key empty_key_sentinel, + KeyEqual const& pred = {}, + ProbingScheme const& probing_scheme = {}, + cuda_thread_scope scope = {}, + Storage storage = {}, + Allocator const& alloc = {}, + cuda_stream_ref stream = {}); + + /** + * @brief Constructs a statically-sized multiset with the number of elements to insert `n`, the + * desired load factor, etc + * + * @note This constructor helps users create a set based on the number of elements to insert and + * the desired load factor without manually computing the desired capacity. The actual set + * capacity will be a size no smaller than `ceil(n / desired_load_factor)`. It's determined by + * multiple factors including the given `n`, the desired load factor, the probing scheme, the CG + * size, and the window size and is computed via the `make_window_extent` factory. + * @note Insert operations will not automatically grow the container. + * @note Attempting to insert more unique keys than the capacity of the container results in + * undefined behavior. + * @note Any `*_sentinel`s are reserved and behavior is undefined when attempting to insert + * this sentinel value. + * @note This constructor doesn't synchronize the given stream. + * @note This overload will convert compile-time extents to runtime constants which might lead to + * performance regressions. + * + * @throw If the desired occupancy is no bigger than zero + * @throw If the desired occupancy is no smaller than one + * + * @param n The number of elements to insert + * @param desired_load_factor The desired load factor of the container, e.g., 0.5 implies a 50% + * load factor + * @param empty_key_sentinel The reserved key value for empty slots + * @param pred Key equality binary predicate + * @param probing_scheme Probing scheme + * @param scope The scope in which operations will be performed + * @param storage Kind of storage to use + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the set + */ + constexpr static_multiset(Extent n, + double desired_load_factor, + empty_key empty_key_sentinel, + KeyEqual const& pred = {}, + ProbingScheme const& probing_scheme = {}, + cuda_thread_scope scope = {}, + Storage storage = {}, + Allocator const& alloc = {}, + cuda_stream_ref stream = {}); + + /** + * @brief Constructs a statically-sized set with the specified initial capacity, sentinel values + * and CUDA stream. + * + * The actual set capacity depends on the given `capacity`, the probing scheme, CG size, and the + * window size and it is computed via the `make_window_extent` factory. Insert operations will not + * automatically grow the set. Attempting to insert more unique keys than the capacity of the + * multiset results in undefined behavior. + * + * @note Any `*_sentinel`s are reserved and behavior is undefined when attempting to insert + * this sentinel value. + * @note If a non-default CUDA stream is provided, the caller is responsible for synchronizing the + * stream before the object is first used. + * + * @param capacity The requested lower-bound set size + * @param empty_key_sentinel The reserved key value for empty slots + * @param erased_key_sentinel The reserved key to denote erased slots + * @param pred Key equality binary predicate + * @param probing_scheme Probing scheme + * @param scope The scope in which operations will be performed + * @param storage Kind of storage to use + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the set + */ + constexpr static_multiset(Extent capacity, + empty_key empty_key_sentinel, + erased_key erased_key_sentinel, + KeyEqual const& pred = {}, + ProbingScheme const& probing_scheme = {}, + cuda_thread_scope scope = {}, + Storage storage = {}, + Allocator const& alloc = {}, + cuda_stream_ref stream = {}); + + /** + * @brief Erases all elements from the container. After this call, `size()` returns zero. + * Invalidates any references, pointers, or iterators referring to contained elements. + * + * @param stream CUDA stream this operation is executed in + */ + void clear(cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Asynchronously erases all elements from the container. After this call, `size()` returns + * zero. Invalidates any references, pointers, or iterators referring to contained elements. + * + * @param stream CUDA stream this operation is executed in + */ + void clear_async(cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Inserts all keys in the range `[first, last)` + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `insert_async`. + * + * // TODO: to be revised due to heterogeneous lookup + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_multiset::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert(InputIt first, InputIt last, cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously inserts all keys in the range `[first, last)`. + * + * // TODO: to be revised due to heterogeneous lookup + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_multiset::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns + * true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * @note This function synchronizes the given stream. For asynchronous execution use + * `insert_if_async`. + * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + */ + template + void insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding + * stencil returns true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + */ + template + void insert_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Gets the number of elements in the container. + * + * @note This function synchronizes the given stream. + * + * @param stream CUDA stream used to get the number of inserted elements + * @return The number of elements in the container + */ + [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief Gets the maximum number of elements the multiset can hold. + * + * @return The maximum number of elements the multiset can hold + */ + [[nodiscard]] constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an erased key slot. + * + * @return The sentinel value used to represent an erased key slot + */ + [[nodiscard]] constexpr key_type erased_key_sentinel() const noexcept; + + /** + * @brief Get device ref with operators. + * + * @tparam Operators Set of `cuco::op` to be provided by the ref + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return Device ref of the current `static_multiset` object + */ + template + [[nodiscard]] auto ref(Operators... ops) const noexcept; + + private: + std::unique_ptr impl_; +}; +} // namespace cuco + +#include diff --git a/include/cuco/static_multiset_ref.cuh b/include/cuco/static_multiset_ref.cuh new file mode 100644 index 000000000..975ca915b --- /dev/null +++ b/include/cuco/static_multiset_ref.cuh @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuco { + +/** + * @brief Device non-owning "ref" of `static_multiset` that can be used in device code to perform + * arbitrary operations defined in `include/cuco/operator.hpp` + * + * @note Concurrent modify and lookup will be supported if both kinds of operators are specified + * during the ref construction. + * @note cuCollections data structures always place the slot keys on the left-hand + * side when invoking the key comparison predicate. + * @note Ref types are trivially-copyable and are intended to be passed by value. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for options) + * @tparam StorageRef Storage ref type + * @tparam Operators Device operator options defined in `include/cuco/operator.hpp` + */ +template +class static_multiset_ref + : public detail::operator_impl< + Operators, + static_multiset_ref>... { + /// Flag indicating whether duplicate keys are allowed or not + static constexpr auto allows_duplicates = true; + + /// Implementation type + using impl_type = detail:: + open_addressing_ref_impl; + + public: + using key_type = Key; ///< Key Type + using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using storage_ref_type = StorageRef; ///< Type of storage ref + using window_type = typename storage_ref_type::window_type; ///< Window type + using value_type = typename storage_ref_type::value_type; ///< Storage element type + using extent_type = typename storage_ref_type::extent_type; ///< Extent type + using size_type = typename storage_ref_type::size_type; ///< Probing scheme size type + using key_equal = KeyEqual; ///< Type of key equality binary callable + using iterator = typename storage_ref_type::iterator; ///< Slot iterator type + using const_iterator = typename storage_ref_type::const_iterator; ///< Const slot iterator type + + static constexpr auto cg_size = probing_scheme_type::cg_size; ///< Cooperative group size + static constexpr auto window_size = + storage_ref_type::window_size; ///< Number of elements handled per window + static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope + + /** + * @brief Constructs static_multiset_ref + * + * @param empty_key_sentinel Sentinel indicating empty key + * @param predicate Key equality binary callable + * @param probing_scheme Probing scheme + * @param scope The scope in which operations will be performed + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr static_multiset_ref( + cuco::empty_key empty_key_sentinel, + KeyEqual const& predicate, + ProbingScheme const& probing_scheme, + cuda_thread_scope scope, + StorageRef storage_ref) noexcept; + + /** + * @brief Constructs static_multiset_ref + * + * @param empty_key_sentinel Sentinel indicating empty key + * @param erased_key_sentinel Sentinel indicating erased key + * @param predicate Key equality binary callable + * @param probing_scheme Probing scheme + * @param scope The scope in which operations will be performed + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr static_multiset_ref( + cuco::empty_key empty_key_sentinel, + cuco::erased_key erased_key_sentinel, + KeyEqual const& predicate, + ProbingScheme const& probing_scheme, + cuda_thread_scope scope, + StorageRef storage_ref) noexcept; + + /** + * @brief Operator-agnostic move constructor. + * + * @tparam OtherOperators Operator set of the `other` object + * + * @param other Object to construct `*this` from + */ + template + __host__ __device__ explicit constexpr static_multiset_ref( + static_multiset_ref&& + other) noexcept; + + /** + * @brief Gets the maximum number of elements the container can hold. + * + * @return The maximum number of elements the container can hold + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept; + + /** + * @brief Gets the window extent of the current storage. + * + * @return The window extent. + */ + [[nodiscard]] __host__ __device__ constexpr extent_type window_extent() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] __host__ __device__ constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an erased key slot. + * + * @return The sentinel value used to represent an erased key slot + */ + [[nodiscard]] __host__ __device__ constexpr key_type erased_key_sentinel() const noexcept; + + /** + * @brief Gets the key comparator. + * + * @return The comparator used to compare keys + */ + [[nodiscard]] __host__ __device__ constexpr key_equal key_eq() const noexcept; + + /** + * @brief Returns a const_iterator to one past the last slot. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr const_iterator end() const noexcept; + + /** + * @brief Returns an iterator to one past the last slot. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept; + + /** + * @brief Creates a reference with new operators from the current object. + * + * Note that this function uses move semantics and thus invalidates the current object. + * + * @warning Using two or more reference objects to the same container but with + * a different operator set at the same time results in undefined behavior. + * + * @tparam NewOperators List of `cuco::op::*_tag` types + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return `*this` with `NewOperators...` + */ + template + [[nodiscard]] __host__ __device__ auto with(NewOperators... ops) && noexcept; + + private: + impl_type impl_; + + // Mixins need to be friends with this class in order to access private members + template + friend class detail::operator_impl; + + // Refs with other operator sets need to be friends too + template + friend class static_multiset_ref; +}; + +} // namespace cuco + +#include diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh index 39a1e5f3b..f2f661190 100644 --- a/include/cuco/static_set_ref.cuh +++ b/include/cuco/static_set_ref.cuh @@ -64,8 +64,12 @@ class static_set_ref : public detail::operator_impl< Operators, static_set_ref>... { - using impl_type = - detail::open_addressing_ref_impl; + /// Flag indicating whether duplicate keys are allowed or not + static constexpr auto allows_duplicates = false; + + /// Implementation type + using impl_type = detail:: + open_addressing_ref_impl; public: using key_type = Key; ///< Key Type diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e09efddb3..f77a45178 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -90,6 +90,11 @@ ConfigureTest(DYNAMIC_MAP_TEST dynamic_map/unique_sequence_test.cu dynamic_map/erase_test.cu) +################################################################################################### +# - static_multiset tests ------------------------------------------------------------------------- +ConfigureTest(STATIC_MULTISET_TEST + static_multiset/insert_test.cu) + ################################################################################################### # - static_multimap tests ------------------------------------------------------------------------- ConfigureTest(STATIC_MULTIMAP_TEST diff --git a/tests/static_multiset/insert_test.cu b/tests/static_multiset/insert_test.cu new file mode 100644 index 000000000..2342c6849 --- /dev/null +++ b/tests/static_multiset/insert_test.cu @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include + +#include + +#include + +using size_type = int32_t; + +template +__inline__ void test_insert(Set& set) +{ + using Key = typename Set::key_type; + + auto constexpr num = 300; + + SECTION("Inserting 300 unique keys should get 300 entries in the multiset") + { + auto const keys = thrust::counting_iterator{0}; + set.insert(keys, keys + num); + auto const num_insertions = set.size(); + + REQUIRE(num_insertions == num); + } + + SECTION("Inserting one key for 300 times should get 300 entries in the multiset") + { + auto const keys = thrust::constant_iterator{0}; + set.insert(keys, keys + num); + auto const num_insertions = set.size(); + + REQUIRE(num_insertions == num); + } + + auto const is_even = + cuda::proclaim_return_type([] __device__(size_type const& i) { return i % 2 == 0; }); + + SECTION("Inserting all even values between [0, 300) should get 150 entries in the multiset") + { + auto const keys = thrust::counting_iterator{0}; + set.insert_if(keys, keys + num, keys, is_even); + auto const num_insertions = set.size(); + + REQUIRE(num_insertions == num / 2); + } + + SECTION("Conditionally inserting one key for 150 times should get 150 entries in the multiset") + { + auto const keys = thrust::constant_iterator{0}; + set.insert_if(keys, keys + num, thrust::counting_iterator{0}, is_even); + auto const num_insertions = set.size(); + + REQUIRE(num_insertions == num / 2); + } +} + +TEMPLATE_TEST_CASE_SIG( + "static_multiset insert tests", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr size_type num_keys{400}; + constexpr size_type gold_capacity = CGSize == 1 ? 422 // 211 x 1 x 2 + : 412; // 103 x 2 x 2 + + using probe = std::conditional_t>, + cuco::double_hashing>>; + + auto set = + cuco::static_multiset{num_keys, cuco::empty_key{-1}, {}, probe{}, {}, cuco::storage<2>{}}; + + REQUIRE(set.capacity() == gold_capacity); + + test_insert(set); +}