From fea87cb85cb2aa1dd2dd1c767bc56c256a734659 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 12:39:24 -0700 Subject: [PATCH] Move `flatten_single_pass_aggs` to its own TU (#17053) Part of splitting the original bulk shared memory groupby PR https://github.com/rapidsai/cudf/pull/16619. This PR separates `flatten_single_pass_aggs` into its own translation unit without making any code modifications. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Shruti Shivakumar (https://github.com/shrshi) - Karthikeyan (https://github.com/karthikeyann) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17053 --- cpp/CMakeLists.txt | 1 + .../groupby/hash/flatten_single_pass_aggs.cpp | 139 ++++++++++++++++++ .../groupby/hash/flatten_single_pass_aggs.hpp | 33 +++++ cpp/src/groupby/hash/groupby.cu | 105 +------------ 4 files changed, 174 insertions(+), 104 deletions(-) create mode 100644 cpp/src/groupby/hash/flatten_single_pass_aggs.cpp create mode 100644 cpp/src/groupby/hash/flatten_single_pass_aggs.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f7a5dd2f2fb..a0ea9579475 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -315,6 +315,7 @@ add_library( src/filling/repeat.cu src/filling/sequence.cu src/groupby/groupby.cu + src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp new file mode 100644 index 00000000000..b2048a9fbb8 --- /dev/null +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flatten_single_pass_aggs.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +class groupby_simple_aggregations_collector final + : public cudf::detail::simple_aggregations_collector { + public: + using cudf::detail::simple_aggregations_collector::visit; + + std::vector> visit(data_type col_type, + cudf::detail::min_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() + : make_min_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::max_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? make_argmax_aggregation() + : make_max_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::mean_aggregation const&) override + { + (void)col_type; + CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::var_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::std_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit( + data_type, cudf::detail::correlation_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } +}; + +// flatten aggs to filter in single pass aggs +std::tuple, std::vector>> +flatten_single_pass_aggs(host_span requests) +{ + std::vector columns; + std::vector> aggs; + std::vector agg_kinds; + + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + + std::unordered_set agg_kinds_set; + auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { + if (agg_kinds_set.insert(agg->kind).second) { + agg_kinds.push_back(agg->kind); + aggs.push_back(std::move(agg)); + columns.push_back(request_values); + } + }; + + auto values_type = cudf::is_dictionary(request.values.type()) + ? cudf::dictionary_column_view(request.values).keys().type() + : request.values.type(); + for (auto&& agg : agg_v) { + groupby_simple_aggregations_collector collector; + + for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { + insert_agg(request.values, std::move(agg_s)); + } + } + } + + return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); +} + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp new file mode 100644 index 00000000000..2bf983e5e90 --- /dev/null +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +// flatten aggs to filter in single pass aggs +std::tuple, std::vector>> +flatten_single_pass_aggs(host_span requests); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index f9a80a048b5..75767786272 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "flatten_single_pass_aggs.hpp" #include "groupby/common/utils.hpp" #include "groupby/hash/groupby_kernels.cuh" @@ -110,76 +111,6 @@ bool constexpr is_hash_aggregation(aggregation::Kind t) return array_contains(hash_aggregations, t); } -class groupby_simple_aggregations_collector final - : public cudf::detail::simple_aggregations_collector { - public: - using cudf::detail::simple_aggregations_collector::visit; - - std::vector> visit(data_type col_type, - cudf::detail::min_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() - : make_min_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::max_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmax_aggregation() - : make_max_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::mean_aggregation const&) override - { - (void)col_type; - CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::var_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::std_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit( - data_type, cudf::detail::correlation_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } -}; - template class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { column_view col; @@ -347,40 +278,6 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final dense_results->add_result(col, agg, std::move(result)); } }; -// flatten aggs to filter in single pass aggs -std::tuple, std::vector>> -flatten_single_pass_aggs(host_span requests) -{ - std::vector columns; - std::vector> aggs; - std::vector agg_kinds; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - - std::unordered_set agg_kinds_set; - auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { - if (agg_kinds_set.insert(agg->kind).second) { - agg_kinds.push_back(agg->kind); - aggs.push_back(std::move(agg)); - columns.push_back(request_values); - } - }; - - auto values_type = cudf::is_dictionary(request.values.type()) - ? cudf::dictionary_column_view(request.values).keys().type() - : request.values.type(); - for (auto&& agg : agg_v) { - groupby_simple_aggregations_collector collector; - - for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { - insert_agg(request.values, std::move(agg_s)); - } - } - } - - return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); -} /** * @brief Gather sparse results into dense using `gather_map` and add to