From 267515d105c96ab4cec6626092f7350bc9edb16f Mon Sep 17 00:00:00 2001 From: Ivo Dilov Date: Tue, 2 Apr 2024 18:37:55 +0300 Subject: [PATCH] Fix SYMBOL_LIST Segment index out of bounds bug The check `row < row_count()` in `string_at` was problematic for symbol list entries because they have the exception that it is possible for a column to have more rows than segment rows. That exception is because of the nature of the symbol list refactor in 4.2.0 which needed to remain backwards compatible with old readers. This change bypasses the check for symbol list entries and directly calls string_at or scalar_at of the Column. --- .../column_store/memory_segment_impl.cpp | 1 + cpp/arcticdb/version/symbol_list.cpp | 30 +++++++++++++++---- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/cpp/arcticdb/column_store/memory_segment_impl.cpp b/cpp/arcticdb/column_store/memory_segment_impl.cpp index 8f8acf55df..22382fcbbd 100644 --- a/cpp/arcticdb/column_store/memory_segment_impl.cpp +++ b/cpp/arcticdb/column_store/memory_segment_impl.cpp @@ -564,6 +564,7 @@ std::optional SegmentInMemoryImpl::string_at(position_t row, p const auto& col_ref = column(col); if(is_fixed_string_type(td.data_type()) && col_ref.is_inflated()) { + auto string_size = col_ref.bytes() / row_count(); auto ptr = col_ref.data().buffer().ptr_cast(row * string_size, string_size); return std::string_view(ptr, string_size); diff --git a/cpp/arcticdb/version/symbol_list.cpp b/cpp/arcticdb/version/symbol_list.cpp index 6298a76c60..a4ae5de32e 100644 --- a/cpp/arcticdb/version/symbol_list.cpp +++ b/cpp/arcticdb/version/symbol_list.cpp @@ -128,17 +128,35 @@ MaybeCompaction last_compaction(const std::vector& keys) { } } +// The below string_at and scalar_at functions should be used for symbol list cache segments instead of the ones +// provided in SegmentInMemory, because the symbol list structure is the only place where columns can have more entries +// than the segment has rows. Hence, we need to bypass the checks inside SegmentInMemory's function and directly call the +// Column's string_at and scalar_at. +std::string string_at(const SegmentInMemory& seg, position_t row, position_t col){ + auto offset = seg.column(col).scalar_at(row); + util::check(offset.has_value(), "Symbol list trying to call string_at for missing row {}, column {}", row, col); + return std::string(seg.string_pool_ptr()->get_view(offset.value())); +} + +template +T scalar_at(const SegmentInMemory& seg, position_t row, position_t col){ + auto scalar = seg.column(col).scalar_at(row); + util::check(scalar.has_value(), "Symbol list trying to call scalar_at for missing row {}, column {}", row, col); + return scalar.value(); +} + + StreamId stream_id_from_segment( DataType data_type, const SegmentInMemory& seg, position_t row_id, position_t column) { if (data_type == DataType::UINT64) { - auto num_id = seg.scalar_at(row_id, column).value(); + auto num_id = scalar_at(seg, row_id, column); ARCTICDB_DEBUG(log::symbol(), "Reading numeric symbol {}", num_id); return safe_convert_to_numeric_id(num_id); } else { - auto sym = std::string(seg.string_at(row_id, column).value()); + auto sym = string_at(seg, row_id, column); ARCTICDB_DEBUG(log::symbol(), "Reading string symbol '{}'", sym); return {std::move(sym)}; } @@ -183,8 +201,8 @@ std::vector read_new_style_list_from_storage(const SegmentInMem for(auto i = 0L; i < seg.column(0).row_count(); ++i) { auto stream_id = stream_id_from_segment(data_type, seg, i, 0); - auto reference_id = VersionId{seg.scalar_at(i, 1).value()}; - auto reference_time = timestamp{seg.scalar_at(i, 2).value()}; + auto reference_id = VersionId{scalar_at(seg, i, 1)}; + auto reference_time = timestamp{scalar_at(seg, i, 2)}; ARCTICDB_RUNTIME_DEBUG(log::symbol(), "Reading added symbol {}: {}@{}", stream_id, reference_id, reference_time); output.emplace_back(stream_id, reference_id, reference_time, ActionType::ADD); } @@ -195,8 +213,8 @@ std::vector read_new_style_list_from_storage(const SegmentInMem for (auto i = 0L; i < seg.column(3).row_count(); ++i) { auto stream_id = stream_id_from_segment(data_type, seg, i, 3); - auto reference_id = VersionId{seg.scalar_at(i, 4).value()}; - auto reference_time = timestamp{seg.scalar_at(i, 5).value()}; + auto reference_id = VersionId{scalar_at(seg, i, 4)}; + auto reference_time = timestamp{scalar_at(seg, i, 5)}; ARCTICDB_RUNTIME_DEBUG(log::symbol(), "Reading deleted symbol {}: {}@{}", stream_id, reference_id, reference_time); output.emplace_back(stream_id, reference_id, reference_time, ActionType::DELETE); }