rapidsai
diff --git a/‎conda/environments/all_cuda-129_arch-aarch64.yaml‎
Lines changed: 1 addition & 0 deletions b/‎conda/environments/all_cuda-129_arch-aarch64.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎conda/environments/all_cuda-129_arch-x86_64.yaml‎
Lines changed: 1 addition & 0 deletions b/‎conda/environments/all_cuda-129_arch-x86_64.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎conda/environments/all_cuda-130_arch-aarch64.yaml‎
Lines changed: 1 addition & 0 deletions b/‎conda/environments/all_cuda-130_arch-aarch64.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎conda/environments/all_cuda-130_arch-x86_64.yaml‎
Lines changed: 1 addition & 0 deletions b/‎conda/environments/all_cuda-130_arch-x86_64.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cpp/CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions b/‎cpp/CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎cpp/benchmarks/CMakeLists.txt‎
Lines changed: 13 additions & 0 deletions b/‎cpp/benchmarks/CMakeLists.txt‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎cpp/benchmarks/io/parquet/experimental/parquet_deletion_vectors.cpp‎
Lines changed: 255 additions & 0 deletions b/‎cpp/benchmarks/io/parquet/experimental/parquet_deletion_vectors.cpp‎
Lines changed: 255 additions & 0 deletions
diff --git a/‎cpp/cmake/thirdparty/get_croaring.cmake‎
Lines changed: 47 additions & 0 deletions b/‎cpp/cmake/thirdparty/get_croaring.cmake‎
Lines changed: 47 additions & 0 deletions
@@ -74,6 +74,7 @@ dependencies:
 - pytest-benchmark
 - pytest-cases>=3.8.2
 - pytest-cov
+- pytest-httpserver
 - pytest-rerunfailures!=16.0.0
 - pytest-xdist
 - python-confluent-kafka>=2.8.0,<2.9.0a0
 
@@ -75,6 +75,7 @@ dependencies:
 - pytest-benchmark
 - pytest-cases>=3.8.2
 - pytest-cov
+- pytest-httpserver
 - pytest-rerunfailures!=16.0.0
 - pytest-xdist
 - python-confluent-kafka>=2.8.0,<2.9.0a0
 
@@ -74,6 +74,7 @@ dependencies:
 - pytest-benchmark
 - pytest-cases>=3.8.2
 - pytest-cov
+- pytest-httpserver
 - pytest-rerunfailures!=16.0.0
 - pytest-xdist
 - python-confluent-kafka>=2.8.0,<2.9.0a0
 
@@ -75,6 +75,7 @@ dependencies:
 - pytest-benchmark
 - pytest-cases>=3.8.2
 - pytest-cov
+- pytest-httpserver
 - pytest-rerunfailures!=16.0.0
 - pytest-xdist
 - python-confluent-kafka>=2.8.0,<2.9.0a0
 
@@ -292,6 +292,9 @@ include(cmake/thirdparty/get_cccl.cmake)
 # find rmm
 include(cmake/thirdparty/get_rmm.cmake)
 
+# find croaring
+include(cmake/thirdparty/get_croaring.cmake)
+
 # find flatbuffers
 include(cmake/thirdparty/get_flatbuffers.cmake)
 
@@ -536,6 +539,7 @@ add_library(
   src/io/parquet/compact_protocol_writer.cpp
   src/io/parquet/decode_preprocess.cu
   src/io/parquet/experimental/dictionary_page_filter.cu
+  src/io/parquet/experimental/deletion_vectors.cu
   src/io/parquet/experimental/hybrid_scan.cpp
   src/io/parquet/experimental/hybrid_scan_chunking.cu
   src/io/parquet/experimental/hybrid_scan_helpers.cpp
 
@@ -329,6 +329,19 @@ ConfigureNVBench(
   PARQUET_EXPERIMENTAL_READER_NVBENCH io/parquet/experimental/parquet_dictionary_page_filter.cpp
 )
 
+# ##################################################################################################
+# * parquet deletion vector benchmark
+# ----------------------------------------------------------------------
+ConfigureNVBench(
+  PARQUET_DELETION_VECTORS_NVBENCH io/parquet/experimental/parquet_deletion_vectors.cpp
+)
+# The benchmark source includes the roaring headers directly. NOTE(review): these definitions
+# appear to mirror the ROARING_DISABLE_* options passed when CRoaring itself is configured in
+# cmake/thirdparty/get_croaring.cmake -- confirm the two lists stay in sync when either changes.
+target_compile_definitions(
+  PARQUET_DELETION_VECTORS_NVBENCH
+  PRIVATE DISABLENEON=1 ROARING_DISABLE_X64=1 ROARING_DISABLE_AVX=1
+          CROARING_COMPILER_SUPPORTS_AVX512=0
+)
+# Link the CRoaring target used by the benchmark to build its deletion vectors
+target_link_libraries(PARQUET_DELETION_VECTORS_NVBENCH PRIVATE roaring)
+
 # ##################################################################################################
 # * parquet multithread reader benchmark
 # ----------------------------------------------------------------------
 
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/io/cuio_common.hpp>
+#include <benchmarks/io/nvbench_helpers.hpp>
+
+#include <cudf/io/experimental/deletion_vectors.hpp>
+#include <cudf/io/parquet.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+#include <roaring/roaring64.h>
+
+#include <random>
+
+namespace {
+/**
+ * @brief Serializes a roaring64 bitmap to a host vector of cuda::std::byte using CRoaring's
+ * portable serialization format
+ *
+ * @param roaring_bitmap Pointer to the roaring64 bitmap to serialize
+ *
+ * @return Host vector of bytes containing the serialized roaring64 bitmap
+ */
+auto serialize_roaring_bitmap(roaring64_bitmap_t const* roaring_bitmap)
+{
+  CUDF_EXPECTS(roaring_bitmap != nullptr, "Roaring64 bitmap must not be null");
+
+  auto const num_bytes = roaring64_bitmap_portable_size_in_bytes(roaring_bitmap);
+  CUDF_EXPECTS(num_bytes > 0, "Roaring64 bitmap is empty");
+
+  auto serialized_bitmap = thrust::host_vector<cuda::std::byte>(num_bytes);
+
+  // The serializer reports how many bytes it wrote; it must fill the buffer exactly, otherwise
+  // the returned buffer would carry trailing garbage or have been overrun
+  auto const num_bytes_written = roaring64_bitmap_portable_serialize(
+    roaring_bitmap, reinterpret_cast<char*>(serialized_bitmap.data()));
+  CUDF_EXPECTS(num_bytes_written == num_bytes,
+               "Serialized roaring64 bitmap size does not match the expected size");
+
+  return serialized_bitmap;
+}
+
+/**
+ * @brief Builds a host vector of expected row indices from the specified row group offsets and
+ * row counts
+ *
+ * Row groups are laid out back to back in the output. The rows of row group `i` receive the
+ * consecutive indices `row_group_offsets[i]`, `row_group_offsets[i] + 1`, and so on. Row groups
+ * with zero rows contribute nothing to the output.
+ *
+ * @param row_group_offsets Row index assigned to the first row of each row group
+ * @param row_group_num_rows Number of rows in each row group
+ * @param num_rows Total number of table rows
+ *
+ * @return Host vector of expected row indices
+ */
+auto build_row_indices(cudf::host_span<size_t const> row_group_offsets,
+                       cudf::host_span<cudf::size_type const> row_group_num_rows,
+                       cudf::size_type num_rows)
+{
+  CUDF_EXPECTS(row_group_offsets.size() == row_group_num_rows.size(),
+               "Row group offsets and row counts must describe the same number of row groups");
+  CUDF_EXPECTS(num_rows >= 0, "Number of rows must be non-negative");
+
+  auto expected_row_indices = thrust::host_vector<size_t>(num_rows);
+
+  // Fill each row group's span directly and bounds-check every span. A trailing row group with
+  // zero rows starts at `num_rows`, so writing its offset unconditionally at its start position
+  // would land one past the end of the buffer
+  auto num_rows_written = size_t{0};
+  for (size_t row_group_idx = 0; row_group_idx < row_group_num_rows.size(); ++row_group_idx) {
+    auto const current_num_rows = row_group_num_rows[row_group_idx];
+    CUDF_EXPECTS(current_num_rows >= 0, "Row group row counts must be non-negative");
+    CUDF_EXPECTS(
+      num_rows_written + static_cast<size_t>(current_num_rows) <= expected_row_indices.size(),
+      "Row group row counts exceed the total number of rows");
+
+    auto row_index = row_group_offsets[row_group_idx];
+    for (cudf::size_type row = 0; row < current_num_rows; ++row) {
+      expected_row_indices[num_rows_written++] = row_index++;
+    }
+  }
+
+  CUDF_EXPECTS(num_rows_written == expected_row_indices.size(),
+               "Row group row counts must add up to the total number of rows");
+
+  return expected_row_indices;
+}
+
+/**
+ * @brief Builds a serialized roaring64 deletion vector in which each table row is marked as
+ * deleted with the specified probability
+ *
+ * @param row_group_offsets Row group row offsets
+ * @param row_group_num_rows Number of rows in each row group
+ * @param num_rows Number of rows in the table
+ * @param deletion_probability The probability of a row being deleted, in the range [0, 1]
+ *
+ * @return Serialized roaring64 bitmap buffer
+ */
+auto build_deletion_vector(cudf::host_span<size_t const> row_group_offsets,
+                           cudf::host_span<cudf::size_type const> row_group_num_rows,
+                           cudf::size_type num_rows,
+                           float deletion_probability)
+{
+  CUDF_EXPECTS(deletion_probability >= 0.0f and deletion_probability <= 1.0f,
+               "Deletion probability must be in the range [0, 1]");
+
+  std::mt19937 engine{0xbaLL};
+  std::bernoulli_distribution dist(deletion_probability);
+
+  auto row_indices = build_row_indices(row_group_offsets, row_group_num_rows, num_rows);
+
+  CUDF_EXPECTS(std::cmp_equal(row_indices.size(), num_rows),
+               "Row indices vector must have the same number of rows as the table");
+
+  // Owns the roaring64 bitmap so that it is released on every exit path, including when
+  // serialization throws
+  struct bitmap_guard {
+    roaring64_bitmap_t* bitmap;
+    ~bitmap_guard() { roaring64_bitmap_free(bitmap); }
+  };
+  bitmap_guard const deletion_vector{roaring64_bitmap_create()};
+  CUDF_EXPECTS(deletion_vector.bitmap != nullptr, "Failed to create the roaring64 bitmap");
+
+  // Context for the roaring64 bitmap for faster (bulk) add operations
+  auto roaring64_context =
+    roaring64_bulk_context_t{.high_bytes = {0, 0, 0, 0, 0, 0}, .leaf = nullptr};
+
+  // One Bernoulli draw per row, in row order: a `true` draw (probability `deletion_probability`)
+  // marks the row as deleted by inserting its row index into the bitmap
+  for (size_t row_idx = 0; row_idx < row_indices.size(); ++row_idx) {
+    if (dist(engine)) {
+      roaring64_bitmap_add_bulk(deletion_vector.bitmap, &roaring64_context, row_indices[row_idx]);
+    }
+  }
+
+  return serialize_roaring_bitmap(deletion_vector.bitmap);
+}
+
+/**
+ * @brief Writes a random table to a parquet sink and builds the inputs needed to apply a
+ * deletion vector to it
+ *
+ * The table shape, deletion probability and IO type are read from the `num_cols`,
+ * `rows_per_row_group`, `num_row_groups`, `deletion_probability` and `io_type` benchmark axes.
+ *
+ * @param state nvbench state providing the benchmark axis values
+ *
+ * @return Tuple of the source/sink pair holding the written parquet data, the row group row
+ * offsets, the number of rows in each row group, and the serialized roaring64 deletion vector
+ */
+auto setup_table_and_deletion_vector(nvbench::state& state)
+{
+  auto const num_columns = static_cast<cudf::size_type>(state.get_int64("num_cols"));
+  auto const rows_per_row_group =
+    static_cast<cudf::size_type>(state.get_int64("rows_per_row_group"));
+  auto const num_row_groups       = static_cast<cudf::size_type>(state.get_int64("num_row_groups"));
+  auto const deletion_probability = static_cast<float>(state.get_float64("deletion_probability"));
+  auto const source_type          = retrieve_io_type_enum(state.get_string("io_type"));
+
+  // The vectors below are sized from these values and indexed at position zero
+  CUDF_EXPECTS(num_row_groups > 0 and rows_per_row_group > 0,
+               "Number of row groups and rows per row group must be positive");
+  auto const num_rows = rows_per_row_group * num_row_groups;
+
+  cuio_source_sink_pair source_sink(source_type);
+
+  // Create a table and write it to parquet sink
+  {
+    auto const d_types = std::vector<cudf::type_id>{
+      cudf::type_id::FLOAT64,
+      cudf::type_id::DURATION_MICROSECONDS,
+      cudf::type_id::TIMESTAMP_MILLISECONDS,
+      cudf::type_id::STRING,
+    };
+
+    auto const table = create_random_table(cycle_dtypes(d_types, num_columns),
+                                           row_count{num_rows},
+                                           data_profile_builder().null_probability(0.10),
+                                           0xbad);
+    cudf::io::parquet_writer_options write_opts =
+      cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), table->view())
+        .row_group_size_rows(rows_per_row_group)
+        .compression(cudf::io::compression_type::NONE);
+    cudf::io::write_parquet(write_opts);
+  }
+
+  // Row offsets for each row group - arbitrary, only used to build the index column. The first
+  // row group starts at 2e9 and every following row group starts 0.5e9 later
+  auto row_group_offsets = thrust::host_vector<size_t>(num_row_groups);
+  row_group_offsets[0]   = size_t{2'000'000'000};
+  for (size_t i = 1; i < row_group_offsets.size(); ++i) {
+    row_group_offsets[i] = row_group_offsets[i - 1] + size_t{500'000'000};
+  }
+
+  // Row group splits: sorted random cut points in [1, num_rows]
+  auto row_group_splits = thrust::host_vector<cudf::size_type>(num_row_groups - 1);
+  {
+    std::mt19937 engine{0xf00d};
+    std::uniform_int_distribution<cudf::size_type> dist{1, num_rows};
+    std::generate(row_group_splits.begin(), row_group_splits.end(), [&]() { return dist(engine); });
+    std::sort(row_group_splits.begin(), row_group_splits.end());
+  }
+
+  // Number of rows in each row group: the distance between consecutive splits, with the last
+  // row group taking whatever remains after the final split
+  auto row_group_num_rows = thrust::host_vector<cudf::size_type>{};
+  {
+    row_group_num_rows.reserve(num_row_groups);
+    auto previous_split = cudf::size_type{0};
+    for (auto const current_split : row_group_splits) {
+      row_group_num_rows.push_back(current_split - previous_split);
+      previous_split = current_split;
+    }
+    // `previous_split` is still zero when there is a single row group (no splits), so this
+    // never reads from an empty splits vector
+    row_group_num_rows.push_back(num_rows - previous_split);
+  }
+
+  auto deletion_vector =
+    build_deletion_vector(row_group_offsets, row_group_num_rows, num_rows, deletion_probability);
+
+  return std::tuple{std::move(source_sink),
+                    std::move(row_group_offsets),
+                    std::move(row_group_num_rows),
+                    std::move(deletion_vector)};
+}
+
+}  // namespace
+
+/**
+ * @brief Benchmarks reading a parquet source while applying a serialized roaring64 deletion
+ * vector
+ *
+ * The input table and deletion vector are produced once by `setup_table_and_deletion_vector`.
+ * Only the read call is timed. Rows per second, peak memory usage and encoded file size are
+ * reported alongside the timings.
+ *
+ * @param state nvbench state providing the benchmark axis values and collecting the results
+ */
+void BM_parquet_deletion_vectors(nvbench::state& state)
+{
+  // Total number of rows in the table, used for the throughput summary
+  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("rows_per_row_group")) *
+                        static_cast<cudf::size_type>(state.get_int64("num_row_groups"));
+
+  auto [parquet_source, rg_offsets, rg_num_rows, serialized_bitmap] =
+    setup_table_and_deletion_vector(state);
+
+  cudf::io::parquet_reader_options reader_options =
+    cudf::io::parquet_reader_options::builder(parquet_source.make_source_info());
+
+  // Created after the setup above and before the timed region below
+  auto memory_logger = cudf::memory_stats_logger();
+
+  auto const stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+
+  auto const timed_read = [&](nvbench::launch&, auto& timer) {
+    try_drop_l3_cache();
+
+    timer.start();
+    [[maybe_unused]] auto const table_with_metadata =
+      cudf::io::parquet::experimental::read_parquet_and_apply_deletion_vector(
+        reader_options, serialized_bitmap, rg_offsets, rg_num_rows);
+    timer.stop();
+  };
+  state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, timed_read);
+
+  // Derive the throughput from the mean GPU time recorded by nvbench
+  auto const mean_gpu_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / mean_gpu_time, "rows_per_second");
+
+  state.add_buffer_size(
+    memory_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(parquet_source.size(), "encoded_file_size", "encoded_file_size");
+}
+
+// Registers BM_parquet_deletion_vectors under the name "parquet_deletion_vectors" and declares
+// the axes read back through `state.get_int64` / `get_float64` / `get_string` in the benchmark.
+// NOTE(review): `num_row_groups` is declared as a power-of-two axis, so the values in
+// `nvbench::range(4, 14, 2)` are presumably exponents rather than row group counts -- confirm
+// against the nvbench documentation.
+NVBENCH_BENCH(BM_parquet_deletion_vectors)
+  .set_name("parquet_deletion_vectors")
+  .set_min_samples(4)
+  .add_int64_power_of_two_axis("num_row_groups", nvbench::range(4, 14, 2))
+  .add_int64_axis("rows_per_row_group", {5'000, 10'000})
+  .add_string_axis("io_type", {"DEVICE_BUFFER"})
+  .add_float64_axis("deletion_probability", {0.15, 0.5, 0.75})
+  .add_int64_axis("num_cols", {4});
@@ -0,0 +1,47 @@
+# =============================================================================
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# Fetch CRoaring through rapids-cmake/CPM and make its `roaring` target available.
+#
+# Arguments:
+#
+# * VERSION - CRoaring version to request; the git tag that is cloned is `v<VERSION>`.
+#
+# CRoaring is configured as a static library with its tests and microbenchmarks switched off and
+# with the NEON, x64, AVX2 and AVX512 options disabled. If `roaring_SOURCE_DIR` is defined after
+# the lookup, `roaring_INCLUDE_DIR` is set to it in the caller's scope.
+function(find_and_configure_roaring VERSION)
+  # Options forwarded to CRoaring's own CMake configuration
+  set(roaring_cpm_options
+      "ROARING_BUILD_STATIC ON"
+      "BUILD_SHARED_LIBS OFF"
+      "ENABLE_ROARING_TESTS OFF"
+      "ENABLE_ROARING_MICROBENCHMARKS OFF"
+      "ROARING_DISABLE_NEON ON"
+      "ROARING_DISABLE_X64 ON"
+      "ROARING_DISABLE_AVX2 ON"
+      "ROARING_DISABLE_AVX512 ON"
+  )
+
+  rapids_cpm_find(
+    roaring ${VERSION}
+    GLOBAL_TARGETS roaring
+    CPM_ARGS
+    GIT_REPOSITORY https://github.com/RoaringBitmap/CRoaring.git
+    GIT_TAG v${VERSION}
+    GIT_SHALLOW TRUE
+    OPTIONS ${roaring_cpm_options}
+  )
+
+  # Only touch the target when this call added it to the build
+  if(roaring_ADDED)
+    set_property(TARGET roaring PROPERTY POSITION_INDEPENDENT_CODE ON)
+  endif()
+
+  # Hand the source location back to the caller when it is known
+  if(DEFINED roaring_SOURCE_DIR)
+    set(roaring_INCLUDE_DIR "${roaring_SOURCE_DIR}" PARENT_SCOPE)
+  endif()
+
+endfunction()
+
+# CRoaring version used by cudf
+set(roaring_VERSION_cudf "4.3.11")
+find_and_configure_roaring(${roaring_VERSION_cudf})