-
Notifications
You must be signed in to change notification settings - Fork 727
JSON subcolumns store change from string to binary #27653
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c65004e
47fc60b
838a371
ead5b58
d9e9c68
a221b54
c3b383a
476c801
e6b1258
d93e1e6
4499f62
b4fd59e
e954dea
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -70,8 +70,18 @@ class TSparsedArrayChunk { | |
| TSparsedArrayChunk(TSparsedArrayChunk&&) = default; | ||
|
|
||
| void VisitValues(const IChunkedArray::TValuesSimpleVisitor& visitor) const { | ||
| visitor(ColValue); | ||
| visitor(DefaultsArray); | ||
| ui32 prevIndex = 0; | ||
| for (ui32 idx = 0; idx < UI32ColIndex->length(); ++idx) { | ||
| auto currentIndex = UI32ColIndex->Value(idx); | ||
| for (ui32 i = prevIndex; i < currentIndex; ++i) { | ||
| visitor(DefaultsArray); | ||
| } | ||
| visitor(ColValue->Slice(idx, 1)); | ||
| prevIndex = currentIndex + 1; | ||
| } | ||
| for (; prevIndex < RecordsCount; ++prevIndex) { | ||
| visitor(DefaultsArray); | ||
Vladilen marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
Comment on lines
+73
to
+84
|
||
| } | ||
|
|
||
| ui32 GetFinishPosition() const { | ||
|
|
@@ -286,6 +296,10 @@ class TSparsedArray: public IChunkedArray { | |
| } | ||
| }; | ||
|
|
||
| static TSparsedBuilder<arrow::BinaryType> MakeBuilderBinary(const ui32 reserveItems = 0, const ui32 reserveData = 0) { | ||
| return TSparsedBuilder<arrow::BinaryType>(nullptr, reserveItems, reserveData); | ||
| } | ||
|
|
||
| static TSparsedBuilder<arrow::StringType> MakeBuilderUtf8(const ui32 reserveItems = 0, const ui32 reserveData = 0) { | ||
| return TSparsedBuilder<arrow::StringType>(nullptr, reserveItems, reserveData); | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,6 +2,8 @@ | |
|
|
||
| #include <ydb/core/formats/arrow/arrow_filter.h> | ||
|
|
||
| #include <yql/essentials/types/binary_json/read.h> | ||
|
|
||
| namespace NKikimr::NArrow::NAccessor::NSubColumns { | ||
| TColumnsData TColumnsData::Slice(const ui32 offset, const ui32 count) const { | ||
| auto records = Records->Slice(offset, count); | ||
|
|
@@ -61,8 +63,8 @@ void TColumnsData::TIterator::InitArrays() { | |
| } | ||
| const ui32 localIndex = FullArrayAddress->GetAddress().GetLocalIndex(CurrentIndex); | ||
| ChunkAddress = FullArrayAddress->GetArray()->GetChunk(ChunkAddress, localIndex); | ||
| AFL_VERIFY(ChunkAddress->GetArray()->type()->id() == arrow::utf8()->id()); | ||
| CurrentArrayData = static_cast<const arrow::StringArray*>(ChunkAddress->GetArray().get()); | ||
| AFL_VERIFY(ChunkAddress->GetArray()->type()->id() == arrow::binary()->id()); | ||
| CurrentArrayData = static_cast<const arrow::BinaryArray*>(ChunkAddress->GetArray().get()); | ||
| if (FullArrayAddress->GetArray()->GetType() == IChunkedArray::EType::Array) { | ||
| if (CurrentArrayData->IsNull(localIndex)) { | ||
| Next(); | ||
|
|
@@ -82,4 +84,15 @@ void TColumnsData::TIterator::InitArrays() { | |
| AFL_VERIFY(CurrentIndex <= GlobalChunkedArray->GetRecordsCount())("index", CurrentIndex)("count", GlobalChunkedArray->GetRecordsCount()); | ||
| } | ||
|
|
||
| NJson::TJsonValue TColumnsData::TIterator::GetValue() const { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Я, кстати, не уверен, что NJson::TJsonValue — это самый эффективный способ хранения json; можно будет поисследовать другие либы, как они по перфу. Это уже отдельно можно провернуть. |
||
| auto view = CurrentArrayData->GetView(ChunkAddress->GetAddress().GetLocalIndex(CurrentIndex)); | ||
| if (view.empty()) { | ||
| return NJson::TJsonValue(NJson::JSON_UNDEFINED); | ||
| } | ||
| auto data = NBinaryJson::SerializeToJson(TStringBuf(view.data(), view.size())); | ||
| NJson::TJsonValue res; | ||
| AFL_VERIFY(NJson::ReadJsonTree(data, &res)); | ||
| return res; | ||
| } | ||
|
|
||
| } // namespace NKikimr::NArrow::NAccessor::NSubColumns | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -16,18 +16,20 @@ namespace NKikimr::NArrow::NAccessor::NSubColumns { | |||||||||||||
|
|
||||||||||||||
| void TColumnElements::BuildSparsedAccessor(const ui32 recordsCount) { | ||||||||||||||
| AFL_VERIFY(!Accessor); | ||||||||||||||
| auto recordsBuilder = TSparsedArray::MakeBuilderUtf8(RecordIndexes.size(), DataSize); | ||||||||||||||
| auto recordsBuilder = TSparsedArray::MakeBuilderBinary(RecordIndexes.size(), DataSize); | ||||||||||||||
| for (ui32 idx = 0; idx < RecordIndexes.size(); ++idx) { | ||||||||||||||
| recordsBuilder.AddRecord(RecordIndexes[idx], Values[idx]); | ||||||||||||||
| const auto& rec = Values[idx]; | ||||||||||||||
| recordsBuilder.AddRecord(RecordIndexes[idx], std::string_view(rec.Data(), rec.Size())); | ||||||||||||||
| } | ||||||||||||||
| Accessor = recordsBuilder.Finish(recordsCount); | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| void TColumnElements::BuildPlainAccessor(const ui32 recordsCount) { | ||||||||||||||
| AFL_VERIFY(!Accessor); | ||||||||||||||
| auto builder = TTrivialArray::MakeBuilderUtf8(recordsCount, DataSize); | ||||||||||||||
| auto builder = TTrivialArray::MakeBuilderBinary(recordsCount, DataSize); | ||||||||||||||
| for (auto it = RecordIndexes.begin(); it != RecordIndexes.end(); ++it) { | ||||||||||||||
| builder.AddRecord(*it, Values[it - RecordIndexes.begin()]); | ||||||||||||||
| const auto& rec = Values[it - RecordIndexes.begin()]; | ||||||||||||||
| builder.AddRecord(*it, std::string_view(rec.Data(), rec.Size())); | ||||||||||||||
| } | ||||||||||||||
| Accessor = builder.Finish(recordsCount); | ||||||||||||||
| } | ||||||||||||||
|
|
@@ -87,7 +89,7 @@ std::shared_ptr<TSubColumnsArray> TDataBuilder::Finish() { | |||||||||||||
|
|
||||||||||||||
| auto records = std::make_shared<TGeneralContainer>(CurrentRecordIndex); | ||||||||||||||
| for (auto&& i : columnElements) { | ||||||||||||||
| records->AddField(std::make_shared<arrow::Field>(std::string(i->GetKeyName()), arrow::utf8()), i->GetAccessorVerified()).Validate(); | ||||||||||||||
| records->AddField(std::make_shared<arrow::Field>(std::string(i->GetKeyName()), arrow::binary()), i->GetAccessorVerified()).Validate(); | ||||||||||||||
| } | ||||||||||||||
| TColumnsData cData(std::move(columnStats), std::move(records)); | ||||||||||||||
| return std::make_shared<TSubColumnsArray>(std::move(cData), std::move(rbOthers), Type, CurrentRecordIndex, Settings); | ||||||||||||||
|
|
@@ -105,7 +107,10 @@ TOthersData TDataBuilder::MergeOthers(const std::vector<TColumnElements*>& other | |||||||||||||
| auto othersBuilder = TOthersData::MakeMergedBuilder(); | ||||||||||||||
| while (heap.size()) { | ||||||||||||||
| std::pop_heap(heap.begin(), heap.end()); | ||||||||||||||
| othersBuilder->AddImpl(heap.back().GetRecordIndex(), heap.back().GetKeyIndex(), heap.back().GetValuePointer()); | ||||||||||||||
| std::string_view view = heap.back().GetValuePointer() ? | ||||||||||||||
| std::string_view(heap.back().GetValuePointer()->Data(), heap.back().GetValuePointer()->Size()) : ""; | ||||||||||||||
| std::string_view* viewPtr = heap.back().GetValuePointer() ? &view : nullptr; | ||||||||||||||
|
Comment on lines
+110
to
+112
|
||||||||||||||
| std::string_view view = heap.back().GetValuePointer() ? | |
| std::string_view(heap.back().GetValuePointer()->Data(), heap.back().GetValuePointer()->Size()) : ""; | |
| std::string_view* viewPtr = heap.back().GetValuePointer() ? &view : nullptr; | |
| auto* valuePtr = heap.back().GetValuePointer(); | |
| std::string_view view = valuePtr ? std::string_view(valuePtr->Data(), valuePtr->Size()) : ""; | |
| std::string_view* viewPtr = valuePtr ? &view : nullptr; |
Uh oh!
There was an error while loading. Please reload this page.