Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 40 additions & 7 deletions bolt/dwio/parquet/reader/ParquetReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1548,15 +1548,48 @@ class ParquetRowReader::Impl {
}

// Skips up to `skipSize` rows and returns how many rows were actually
// skipped. The returned count can be smaller than `skipSize` when the list of
// row group ids is exhausted or when advanceToNextRowGroup() reports failure.
//
// NOTE(review): this listing looks like a rendered diff with the +/- markers
// stripped, so statements of the previous implementation (the
// nextReadSize()/kAtEnd path and the single-step `return rowsToSkip`) are
// interleaved with the replacement (the `totalSkipped` path). Confirm against
// the real file before treating this text as compilable code.
uint64_t skip(uint64_t skipSize) {
// Previous implementation: ask nextReadSize() how many rows can be consumed
// and return 0 when it reports kAtEnd.
auto rowsToSkip = nextReadSize(skipSize);
if (rowsToSkip == kAtEnd) {
return 0;
// Replacement implementation: running total of rows skipped so far; this is
// the value returned at the end.
uint64_t totalSkipped = 0;

// First, consume rows remaining in the currently loaded row group, if any.
if (skipSize > 0 && currentRowInGroup_ < rowsInCurrentRowGroup_) {
// Clamp to the rows left in the current group so the in-group position never
// moves past rowsInCurrentRowGroup_.
const auto rowsToSkip = std::min<uint64_t>(
skipSize, rowsInCurrentRowGroup_ - currentRowInGroup_);
// Move the column reader's read offset forward by the same number of rows
// that the in-group position advances.
columnReader_->setReadOffset(columnReader_->readOffset() + rowsToSkip);
currentRowInGroup_ += rowsToSkip;
totalSkipped += rowsToSkip;
skipSize -= rowsToSkip;
}

// Previous implementation: a single in-group skip of `rowsToSkip` rows,
// asserted (debug check) to be positive.
BOLT_DCHECK_GT(rowsToSkip, 0);
columnReader_->setReadOffset(columnReader_->readOffset() + rowsToSkip);
currentRowInGroup_ += rowsToSkip;
return rowsToSkip;
// Then, skip over whole row groups purely using footer metadata without
// loading them. When a row group can be fully skipped, just advance the
// index instead of scheduling/loading it. When skip lands inside a row
// group, advance to it (which schedules/loads it) and adjust the in-group
// read offset.
while (skipSize > 0 && nextRowGroupIdsIdx_ < rowGroupIds_.size()) {
// Look up the next pending row group and the row count recorded for it in
// rowGroups_.
// NOTE(review): if `num_rows` is a signed type, the comparison and
// subtraction against the unsigned `skipSize` below mix signedness — confirm.
const auto nextRowGroupIndex = rowGroupIds_[nextRowGroupIdsIdx_];
const auto rowsInNextGroup = rowGroups_[nextRowGroupIndex].num_rows;

if (skipSize >= rowsInNextGroup) {
// Skip the whole row group by advancing the index; do not schedule or
// load it.
// currentRowInGroup_ and rowsInCurrentRowGroup_ are left untouched on this
// path; only the pending-group index and the counters change.
skipSize -= rowsInNextGroup;
totalSkipped += rowsInNextGroup;
++nextRowGroupIdsIdx_;
continue;
}

// Landing inside this row group: load it now and seek to the in-group
// offset.
if (!advanceToNextRowGroup()) {
// Could not move to the next row group; report only what was skipped so far.
break;
}
// The remaining `skipSize` is smaller than this group's row count here, so it
// becomes the position inside the newly entered group.
// NOTE(review): assumes advanceToNextRowGroup() leaves the reader positioned
// at the first row of the new group — confirm.
columnReader_->setReadOffset(columnReader_->readOffset() + skipSize);
currentRowInGroup_ = skipSize;
totalSkipped += skipSize;
skipSize = 0;
break;
}
return totalSkipped;
}

uint64_t next(
Expand Down
9 changes: 9 additions & 0 deletions bolt/dwio/parquet/tests/reader/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,12 @@ if(${BOLT_ENABLE_ARROW})
)

endif()

# Builds the bolt_dwio_parquet_reader_skip_benchmark executable from
# ParquetReaderSkipBenchmark.cpp. The target is only defined when
# BOLT_BUILD_BENCHMARKS evaluates to true.
if(${BOLT_BUILD_BENCHMARKS})
  add_executable(
    bolt_dwio_parquet_reader_skip_benchmark
    ParquetReaderSkipBenchmark.cpp
  )

  # Both dependencies are linked with PRIVATE visibility.
  target_link_libraries(
    bolt_dwio_parquet_reader_skip_benchmark
    PRIVATE bolt_testutils ${FOLLY_BENCHMARK}
  )
endif()
Loading