From 8e27af3dd5b940b81011f1da1a327e173ed615f4 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Fri, 12 Dec 2025 12:13:14 +0100 Subject: [PATCH 01/15] Add deserialization of ts_array, without timezone, date and time --- .../deserialize_primitive_array.hpp | 265 ++++++++++++++++++ include/sparrow_ipc/utils.hpp | 2 + src/deserialize.cpp | 129 +++++++++ src/flatbuffer_utils.cpp | 36 ++- src/utils.cpp | 10 + tests/test_de_serialization_with_files.cpp | 1 + 6 files changed, 439 insertions(+), 4 deletions(-) diff --git a/include/sparrow_ipc/deserialize_primitive_array.hpp b/include/sparrow_ipc/deserialize_primitive_array.hpp index 6904f03..cc65b33 100644 --- a/include/sparrow_ipc/deserialize_primitive_array.hpp +++ b/include/sparrow_ipc/deserialize_primitive_array.hpp @@ -4,13 +4,278 @@ #include #include +#include #include +#include +#include +#include #include "Message_generated.h" #include "sparrow_ipc/deserialize_array_impl.hpp" namespace sparrow_ipc { + template + [[nodiscard]] sparrow::date_array deserialize_non_owning_date_array( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + std::string_view name, + const std::optional>& metadata, + bool nullable, + size_t& buffer_index + ) + { + const std::string_view format = data_type_to_format( + sparrow::detail::get_data_type_from_array>::get() + ); + + // Set up flags based on nullable + std::optional> flags; + if (nullable) + { + flags = std::unordered_set{sparrow::ArrowFlag::NULLABLE}; + } + + ArrowSchema schema = make_non_owning_arrow_schema( + format, + name.data(), + metadata, + flags, + 0, + nullptr, + nullptr + ); + + const auto compression = record_batch.compression(); + std::vector buffers; + + auto validity_buffer_span = utils::get_buffer(record_batch, body, buffer_index); + auto data_buffer_span = utils::get_buffer(record_batch, body, buffer_index); + + if (compression) + { + buffers.push_back(utils::get_decompressed_buffer(validity_buffer_span, compression)); + buffers.push_back(utils::get_decompressed_buffer(data_buffer_span, compression)); + } + else + { + buffers.push_back(validity_buffer_span); + buffers.push_back(data_buffer_span); + } + + // TODO bitmap_ptr is not used anymore... Leave it for now, and remove later if no need confirmed + const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); + + ArrowArray array = make_arrow_array( + record_batch.length(), + null_count, + 0, + 0, + nullptr, + nullptr, + std::move(buffers) + ); + + sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; + return sparrow::date_array{std::move(ap)}; + } + + template + [[nodiscard]] sparrow::timestamp_array deserialize_non_owning_timestamp_array( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + std::string_view name, + const std::optional>& metadata, + bool nullable, + size_t& buffer_index, + const std::string& timezone + ) + { + std::string format = std::string(data_type_to_format( + sparrow::detail::get_data_type_from_array>::get() + )) + timezone; + + // Set up flags based on nullable + std::optional> flags; + if (nullable) + { + flags = std::unordered_set{sparrow::ArrowFlag::NULLABLE}; + } + + ArrowSchema schema = make_non_owning_arrow_schema( + format.c_str(), + name.data(), + metadata, + flags, + 0, + nullptr, + nullptr + ); + + const auto compression = record_batch.compression(); + std::vector buffers; + + auto validity_buffer_span = utils::get_buffer(record_batch, body, buffer_index); + auto data_buffer_span = utils::get_buffer(record_batch, body, buffer_index); + + if (compression) + { + buffers.push_back(utils::get_decompressed_buffer(validity_buffer_span, compression)); + buffers.push_back(utils::get_decompressed_buffer(data_buffer_span, compression)); + } + else + { + buffers.push_back(validity_buffer_span); + buffers.push_back(data_buffer_span); + } + + // TODO bitmap_ptr is not used anymore... Leave it for now, and remove later if no need confirmed + const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); + + ArrowArray array = make_arrow_array( + record_batch.length(), + null_count, + 0, + 0, + nullptr, + nullptr, + std::move(buffers) + ); + + sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; + return sparrow::timestamp_array{std::move(ap)}; + } + + template + [[nodiscard]] sparrow::timestamp_without_timezone_array deserialize_non_owning_timestamp_without_timezone_array( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + std::string_view name, + const std::optional>& metadata, + bool nullable, + size_t& buffer_index + ) + { + const std::string_view format = data_type_to_format( + sparrow::detail::get_data_type_from_array>::get() + ); + + // Set up flags based on nullable + std::optional> flags; + if (nullable) + { + flags = std::unordered_set{sparrow::ArrowFlag::NULLABLE}; + } + + ArrowSchema schema = make_non_owning_arrow_schema( + format, + name.data(), + metadata, + flags, + 0, + nullptr, + nullptr + ); + + const auto compression = record_batch.compression(); + std::vector buffers; + + auto validity_buffer_span = utils::get_buffer(record_batch, body, buffer_index); + auto data_buffer_span = utils::get_buffer(record_batch, body, buffer_index); + + if (compression) + { + buffers.push_back(utils::get_decompressed_buffer(validity_buffer_span, compression)); + buffers.push_back(utils::get_decompressed_buffer(data_buffer_span, compression)); + } + else + { + buffers.push_back(validity_buffer_span); + buffers.push_back(data_buffer_span); + } + + // TODO bitmap_ptr is not used anymore... Leave it for now, and remove later if no need confirmed + const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); + + ArrowArray array = make_arrow_array( + record_batch.length(), + null_count, + 0, + 0, + nullptr, + nullptr, + std::move(buffers) + ); + + sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; + return sparrow::timestamp_without_timezone_array{std::move(ap)}; + } + + template + [[nodiscard]] sparrow::time_array deserialize_non_owning_time_array( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + std::string_view name, + const std::optional>& metadata, + bool nullable, + size_t& buffer_index + ) + { + const std::string_view format = data_type_to_format( + sparrow::detail::get_data_type_from_array>::get() + ); + + // Set up flags based on nullable + std::optional> flags; + if (nullable) + { + flags = std::unordered_set{sparrow::ArrowFlag::NULLABLE}; + } + + ArrowSchema schema = make_non_owning_arrow_schema( + format, + name.data(), + metadata, + flags, + 0, + nullptr, + nullptr + ); + + const auto compression = record_batch.compression(); + std::vector buffers; + + auto validity_buffer_span = utils::get_buffer(record_batch, body, buffer_index); + auto data_buffer_span = utils::get_buffer(record_batch, body, buffer_index); + + if (compression) + { + buffers.push_back(utils::get_decompressed_buffer(validity_buffer_span, compression)); + buffers.push_back(utils::get_decompressed_buffer(data_buffer_span, compression)); + } + else + { + buffers.push_back(validity_buffer_span); + buffers.push_back(data_buffer_span); + } + + // TODO bitmap_ptr is not used anymore... Leave it for now, and remove later if no need confirmed + const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); + + ArrowArray array = make_arrow_array( + record_batch.length(), + null_count, + 0, + 0, + nullptr, + nullptr, + std::move(buffers) + ); + + sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; + return sparrow::time_array{std::move(ap)}; + } + template [[nodiscard]] sparrow::primitive_array deserialize_non_owning_primitive_array( const org::apache::arrow::flatbuf::RecordBatch& record_batch, diff --git a/include/sparrow_ipc/utils.hpp b/include/sparrow_ipc/utils.hpp index 124305b..8502cbf 100644 --- a/include/sparrow_ipc/utils.hpp +++ b/include/sparrow_ipc/utils.hpp @@ -96,6 +96,8 @@ namespace sparrow_ipc::utils // Parse the format string // The format string is expected to be "w:size", "+w:size", "d:precision,scale", etc std::optional parse_format(std::string_view format_str, std::string_view sep); + // Used to parse timezone + std::optional parse_format_string(std::string_view format_str, std::string_view sep); // size_t calculate_output_serialized_size(const sparrow::record_batch& record_batch); /** diff --git a/src/deserialize.cpp b/src/deserialize.cpp index 8e9f668..168cb7d 100644 --- a/src/deserialize.cpp +++ b/src/deserialize.cpp @@ -98,6 +98,56 @@ namespace sparrow_ipc buffer_index ); }; + + const auto deserialize_non_owning_date_array_lambda = [&]() + { + return deserialize_non_owning_date_array( + record_batch, + encapsulated_message.body(), + name, + metadata, + nullable, + buffer_index + ); + }; + + const auto deserialize_non_owning_timestamp_array_lambda = [&](const std::string& timezone) + { + return deserialize_non_owning_timestamp_array( + record_batch, + encapsulated_message.body(), + name, + metadata, + nullable, + buffer_index, + timezone + ); + }; + + const auto deserialize_non_owning_timestamp_without_timezone_array_lambda = [&]() + { + return deserialize_non_owning_timestamp_without_timezone_array( + record_batch, + encapsulated_message.body(), + name, + metadata, + nullable, + buffer_index + ); + }; + + const auto deserialize_non_owning_time_array_lambda = [&]() + { + return deserialize_non_owning_time_array( + record_batch, + encapsulated_message.body(), + name, + metadata, + nullable, + buffer_index + ); + }; + switch (field_type) { case org::apache::arrow::flatbuf::Type::Bool: @@ -336,6 +386,85 @@ namespace sparrow_ipc } } break; + case org::apache::arrow::flatbuf::Type::Timestamp: + { + const auto timestamp_type = field->type_as_Timestamp(); + const auto time_unit = timestamp_type->unit(); + const bool has_timezone = timestamp_type->timezone() != nullptr; + + if (has_timezone) + { + const std::string timezone = timestamp_type->timezone()->str(); + switch (time_unit) + { + case org::apache::arrow::flatbuf::TimeUnit::SECOND: + arrays.emplace_back(deserialize_non_owning_timestamp_array_lambda.template operator()(timezone)); + break; + case org::apache::arrow::flatbuf::TimeUnit::MILLISECOND: + arrays.emplace_back(deserialize_non_owning_timestamp_array_lambda.template operator()(timezone)); + break; + case org::apache::arrow::flatbuf::TimeUnit::MICROSECOND: + arrays.emplace_back(deserialize_non_owning_timestamp_array_lambda.template operator()(timezone)); + break; + case org::apache::arrow::flatbuf::TimeUnit::NANOSECOND: + arrays.emplace_back(deserialize_non_owning_timestamp_array_lambda.template operator()(timezone)); + break; + } + } + else + { + switch (time_unit) + { + case org::apache::arrow::flatbuf::TimeUnit::SECOND: + arrays.emplace_back(deserialize_non_owning_timestamp_without_timezone_array_lambda.template operator()()); + break; + case org::apache::arrow::flatbuf::TimeUnit::MILLISECOND: + arrays.emplace_back(deserialize_non_owning_timestamp_without_timezone_array_lambda.template operator()()); + break; + case org::apache::arrow::flatbuf::TimeUnit::MICROSECOND: + arrays.emplace_back(deserialize_non_owning_timestamp_without_timezone_array_lambda.template operator()()); + break; + case org::apache::arrow::flatbuf::TimeUnit::NANOSECOND: + arrays.emplace_back(deserialize_non_owning_timestamp_without_timezone_array_lambda.template operator()()); + break; + } + } + break; + } + case org::apache::arrow::flatbuf::Type::Date: + { + const auto date_type = field->type_as_Date(); + const auto date_unit = date_type->unit(); + switch (date_unit) + { + case org::apache::arrow::flatbuf::DateUnit::DAY: + arrays.emplace_back(deserialize_non_owning_date_array_lambda.template operator()()); + break; + case org::apache::arrow::flatbuf::DateUnit::MILLISECOND: + arrays.emplace_back(deserialize_non_owning_date_array_lambda.template operator()()); + break; + } + break; + } + case org::apache::arrow::flatbuf::Type::Time: + { + const auto time_type = field->type_as_Time(); + const auto time_unit = time_type->unit(); + switch (time_unit) + { + case org::apache::arrow::flatbuf::TimeUnit::SECOND: + arrays.emplace_back(deserialize_non_owning_time_array_lambda.template operator()()); + break; + case org::apache::arrow::flatbuf::TimeUnit::MILLISECOND: + arrays.emplace_back(deserialize_non_owning_time_array_lambda.template operator()()); + break; + case org::apache::arrow::flatbuf::TimeUnit::MICROSECOND: + arrays.emplace_back(deserialize_non_owning_time_array_lambda.template operator()()); + break; + case org::apache::arrow::flatbuf::TimeUnit::NANOSECOND: + arrays.emplace_back(deserialize_non_owning_time_array_lambda.template operator()()); + break; + } case org::apache::arrow::flatbuf::Type::Null: arrays.emplace_back(deserialize_non_owning_null( record_batch, diff --git a/src/flatbuffer_utils.cpp b/src/flatbuffer_utils.cpp index db6fb75..d8ee511 100644 --- a/src/flatbuffer_utils.cpp +++ b/src/flatbuffer_utils.cpp @@ -136,33 +136,61 @@ namespace sparrow_ipc } case sparrow::data_type::TIMESTAMP_SECONDS: { + const auto timezone = utils::parse_format_string(format_str, ":"); + flatbuffers::Offset timezone_offset = 0; + if (timezone.has_value() && !timezone.value().empty()) + { + timezone_offset = builder.CreateString(timezone.value()); + } const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp( builder, - org::apache::arrow::flatbuf::TimeUnit::SECOND + org::apache::arrow::flatbuf::TimeUnit::SECOND, + timezone_offset ); return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()}; } case sparrow::data_type::TIMESTAMP_MILLISECONDS: { + const auto timezone = utils::parse_format_string(format_str, ":"); + flatbuffers::Offset timezone_offset = 0; + if (timezone.has_value() && !timezone.value().empty()) + { + timezone_offset = builder.CreateString(timezone.value()); + } const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp( builder, - org::apache::arrow::flatbuf::TimeUnit::MILLISECOND + org::apache::arrow::flatbuf::TimeUnit::MILLISECOND, + timezone_offset ); return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()}; } case sparrow::data_type::TIMESTAMP_MICROSECONDS: { + const auto timezone = utils::parse_format_string(format_str, ":"); + flatbuffers::Offset timezone_offset = 0; + if (timezone.has_value() && !timezone.value().empty()) + { + timezone_offset = builder.CreateString(timezone.value()); + } const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp( builder, - org::apache::arrow::flatbuf::TimeUnit::MICROSECOND + org::apache::arrow::flatbuf::TimeUnit::MICROSECOND, + timezone_offset ); return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()}; } case sparrow::data_type::TIMESTAMP_NANOSECONDS: { + const auto timezone = utils::parse_format_string(format_str, ":"); + flatbuffers::Offset timezone_offset = 0; + if (timezone.has_value() && !timezone.value().empty()) + { + timezone_offset = builder.CreateString(timezone.value()); + } const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp( builder, - org::apache::arrow::flatbuf::TimeUnit::NANOSECOND + org::apache::arrow::flatbuf::TimeUnit::NANOSECOND, + timezone_offset ); return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()}; } diff --git a/src/utils.cpp b/src/utils.cpp index b4e3d3f..2303538 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -29,6 +29,16 @@ namespace sparrow_ipc::utils return substr_size; } + std::optional parse_format_string(std::string_view format_str, std::string_view sep) + { + const size_t sep_pos = format_str.find(sep); + if (sep_pos == std::string_view::npos) + { + return std::nullopt; + } + return format_str.substr(sep_pos + sep.length()); + } + size_t align_to_8(const size_t n) { return (n + 7) & -8; diff --git a/tests/test_de_serialization_with_files.cpp b/tests/test_de_serialization_with_files.cpp index 98b1db8..f5ca996 100644 --- a/tests/test_de_serialization_with_files.cpp +++ b/tests/test_de_serialization_with_files.cpp @@ -36,6 +36,7 @@ const std::vector files_paths_to_test = { tests_resources_files_path / "generated_binary_no_batches", tests_resources_files_path / "generated_interval", tests_resources_files_path / "generated_duration", + tests_resources_files_path / "generated_datetime", tests_resources_files_path / "generated_null", tests_resources_files_path / "generated_null_trivial", tests_resources_files_path / "generated_decimal32", From 786ebe27e19ea9c344eecb3a8215a2429cc019c5 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Fri, 12 Dec 2025 17:45:44 +0100 Subject: [PATCH 02/15] Some cleanup --- CMakeLists.txt | 1 + .../deserialize_primitive_array.hpp | 265 ----------------- .../deserialize_time_related_arrays.hpp | 280 ++++++++++++++++++ include/sparrow_ipc/utils.hpp | 8 +- src/deserialize.cpp | 1 + src/flatbuffer_utils.cpp | 74 +++-- src/utils.cpp | 14 +- 7 files changed, 332 insertions(+), 311 deletions(-) create mode 100644 include/sparrow_ipc/deserialize_time_related_arrays.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 0326309..ec82fec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -130,6 +130,7 @@ set(SPARROW_IPC_HEADERS ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_interval_array.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_null_array.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_primitive_array.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_time_related_arrays.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_utils.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_variable_size_binary_array.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize.hpp diff --git a/include/sparrow_ipc/deserialize_primitive_array.hpp b/include/sparrow_ipc/deserialize_primitive_array.hpp index cc65b33..6904f03 100644 --- a/include/sparrow_ipc/deserialize_primitive_array.hpp +++ b/include/sparrow_ipc/deserialize_primitive_array.hpp @@ -4,278 +4,13 @@ #include #include -#include #include -#include -#include -#include #include "Message_generated.h" #include "sparrow_ipc/deserialize_array_impl.hpp" namespace sparrow_ipc { - template - [[nodiscard]] sparrow::date_array deserialize_non_owning_date_array( - const org::apache::arrow::flatbuf::RecordBatch& record_batch, - std::span body, - std::string_view name, - const std::optional>& metadata, - bool nullable, - size_t& buffer_index - ) - { - const std::string_view format = data_type_to_format( - sparrow::detail::get_data_type_from_array>::get() - ); - - // Set up flags based on nullable - std::optional> flags; - if (nullable) - { - flags = std::unordered_set{sparrow::ArrowFlag::NULLABLE}; - } - - ArrowSchema schema = make_non_owning_arrow_schema( - format, - name.data(), - metadata, - flags, - 0, - nullptr, - nullptr - ); - - const auto compression = record_batch.compression(); - std::vector buffers; - - auto validity_buffer_span = utils::get_buffer(record_batch, body, buffer_index); - auto data_buffer_span = utils::get_buffer(record_batch, body, buffer_index); - - if (compression) - { - buffers.push_back(utils::get_decompressed_buffer(validity_buffer_span, compression)); - buffers.push_back(utils::get_decompressed_buffer(data_buffer_span, compression)); - } - else - { - buffers.push_back(validity_buffer_span); - buffers.push_back(data_buffer_span); - } - - // TODO bitmap_ptr is not used anymore... Leave it for now, and remove later if no need confirmed - const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); - - ArrowArray array = make_arrow_array( - record_batch.length(), - null_count, - 0, - 0, - nullptr, - nullptr, - std::move(buffers) - ); - - sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; - return sparrow::date_array{std::move(ap)}; - } - - template - [[nodiscard]] sparrow::timestamp_array deserialize_non_owning_timestamp_array( - const org::apache::arrow::flatbuf::RecordBatch& record_batch, - std::span body, - std::string_view name, - const std::optional>& metadata, - bool nullable, - size_t& buffer_index, - const std::string& timezone - ) - { - std::string format = std::string(data_type_to_format( - sparrow::detail::get_data_type_from_array>::get() - )) + timezone; - - // Set up flags based on nullable - std::optional> flags; - if (nullable) - { - flags = std::unordered_set{sparrow::ArrowFlag::NULLABLE}; - } - - ArrowSchema schema = make_non_owning_arrow_schema( - format.c_str(), - name.data(), - metadata, - flags, - 0, - nullptr, - nullptr - ); - - const auto compression = record_batch.compression(); - std::vector buffers; - - auto validity_buffer_span = utils::get_buffer(record_batch, body, buffer_index); - auto data_buffer_span = utils::get_buffer(record_batch, body, buffer_index); - - if (compression) - { - buffers.push_back(utils::get_decompressed_buffer(validity_buffer_span, compression)); - buffers.push_back(utils::get_decompressed_buffer(data_buffer_span, compression)); - } - else - { - buffers.push_back(validity_buffer_span); - buffers.push_back(data_buffer_span); - } - - // TODO bitmap_ptr is not used anymore... Leave it for now, and remove later if no need confirmed - const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); - - ArrowArray array = make_arrow_array( - record_batch.length(), - null_count, - 0, - 0, - nullptr, - nullptr, - std::move(buffers) - ); - - sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; - return sparrow::timestamp_array{std::move(ap)}; - } - - template - [[nodiscard]] sparrow::timestamp_without_timezone_array deserialize_non_owning_timestamp_without_timezone_array( - const org::apache::arrow::flatbuf::RecordBatch& record_batch, - std::span body, - std::string_view name, - const std::optional>& metadata, - bool nullable, - size_t& buffer_index - ) - { - const std::string_view format = data_type_to_format( - sparrow::detail::get_data_type_from_array>::get() - ); - - // Set up flags based on nullable - std::optional> flags; - if (nullable) - { - flags = std::unordered_set{sparrow::ArrowFlag::NULLABLE}; - } - - ArrowSchema schema = make_non_owning_arrow_schema( - format, - name.data(), - metadata, - flags, - 0, - nullptr, - nullptr - ); - - const auto compression = record_batch.compression(); - std::vector buffers; - - auto validity_buffer_span = utils::get_buffer(record_batch, body, buffer_index); - auto data_buffer_span = utils::get_buffer(record_batch, body, buffer_index); - - if (compression) - { - buffers.push_back(utils::get_decompressed_buffer(validity_buffer_span, compression)); - buffers.push_back(utils::get_decompressed_buffer(data_buffer_span, compression)); - } - else - { - buffers.push_back(validity_buffer_span); - buffers.push_back(data_buffer_span); - } - - // TODO bitmap_ptr is not used anymore... Leave it for now, and remove later if no need confirmed - const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); - - ArrowArray array = make_arrow_array( - record_batch.length(), - null_count, - 0, - 0, - nullptr, - nullptr, - std::move(buffers) - ); - - sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; - return sparrow::timestamp_without_timezone_array{std::move(ap)}; - } - - template - [[nodiscard]] sparrow::time_array deserialize_non_owning_time_array( - const org::apache::arrow::flatbuf::RecordBatch& record_batch, - std::span body, - std::string_view name, - const std::optional>& metadata, - bool nullable, - size_t& buffer_index - ) - { - const std::string_view format = data_type_to_format( - sparrow::detail::get_data_type_from_array>::get() - ); - - // Set up flags based on nullable - std::optional> flags; - if (nullable) - { - flags = std::unordered_set{sparrow::ArrowFlag::NULLABLE}; - } - - ArrowSchema schema = make_non_owning_arrow_schema( - format, - name.data(), - metadata, - flags, - 0, - nullptr, - nullptr - ); - - const auto compression = record_batch.compression(); - std::vector buffers; - - auto validity_buffer_span = utils::get_buffer(record_batch, body, buffer_index); - auto data_buffer_span = utils::get_buffer(record_batch, body, buffer_index); - - if (compression) - { - buffers.push_back(utils::get_decompressed_buffer(validity_buffer_span, compression)); - buffers.push_back(utils::get_decompressed_buffer(data_buffer_span, compression)); - } - else - { - buffers.push_back(validity_buffer_span); - buffers.push_back(data_buffer_span); - } - - // TODO bitmap_ptr is not used anymore... Leave it for now, and remove later if no need confirmed - const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); - - ArrowArray array = make_arrow_array( - record_batch.length(), - null_count, - 0, - 0, - nullptr, - nullptr, - std::move(buffers) - ); - - sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; - return sparrow::time_array{std::move(ap)}; - } - template [[nodiscard]] sparrow::primitive_array deserialize_non_owning_primitive_array( const org::apache::arrow::flatbuf::RecordBatch& record_batch, diff --git a/include/sparrow_ipc/deserialize_time_related_arrays.hpp b/include/sparrow_ipc/deserialize_time_related_arrays.hpp new file mode 100644 index 0000000..4dcca59 --- /dev/null +++ b/include/sparrow_ipc/deserialize_time_related_arrays.hpp @@ -0,0 +1,280 @@ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "Message_generated.h" +#include "sparrow_ipc/arrow_interface/arrow_array.hpp" +#include "sparrow_ipc/arrow_interface/arrow_schema.hpp" +#include "sparrow_ipc/deserialize_utils.hpp" + +namespace sparrow_ipc +{ + template + [[nodiscard]] sparrow::date_array deserialize_non_owning_date_array( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + std::string_view name, + const std::optional>& metadata, + bool nullable, + size_t& buffer_index + ) + { + const std::string_view format = data_type_to_format( + sparrow::detail::get_data_type_from_array>::get() + ); + + // Set up flags based on nullable + std::optional> flags; + if (nullable) + { + flags = std::unordered_set{sparrow::ArrowFlag::NULLABLE}; + } + + ArrowSchema schema = make_non_owning_arrow_schema( + format, + name.data(), + metadata, + flags, + 0, + nullptr, + nullptr + ); + + const auto compression = record_batch.compression(); + std::vector buffers; + + auto validity_buffer_span = utils::get_buffer(record_batch, body, buffer_index); + auto data_buffer_span = utils::get_buffer(record_batch, body, buffer_index); + + if (compression) + { + buffers.push_back(utils::get_decompressed_buffer(validity_buffer_span, compression)); + buffers.push_back(utils::get_decompressed_buffer(data_buffer_span, compression)); + } + else + { + buffers.push_back(validity_buffer_span); + buffers.push_back(data_buffer_span); + } + + // TODO bitmap_ptr is not used anymore... Leave it for now, and remove later if no need confirmed + const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); + + ArrowArray array = make_arrow_array( + record_batch.length(), + null_count, + 0, + 0, + nullptr, + nullptr, + std::move(buffers) + ); + + sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; + return sparrow::date_array{std::move(ap)}; + } + + template + [[nodiscard]] sparrow::timestamp_array deserialize_non_owning_timestamp_array( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + std::string_view name, + const std::optional>& metadata, + bool nullable, + size_t& buffer_index, + const std::string& timezone + ) + { + std::string format = std::string(data_type_to_format( + sparrow::detail::get_data_type_from_array>::get() + )) + timezone; + + // Set up flags based on nullable + std::optional> flags; + if (nullable) + { + flags = std::unordered_set{sparrow::ArrowFlag::NULLABLE}; + } + + ArrowSchema schema = make_non_owning_arrow_schema( + format.c_str(), + name.data(), + metadata, + flags, + 0, + nullptr, + nullptr + ); + + const auto compression = record_batch.compression(); + std::vector buffers; + + auto validity_buffer_span = utils::get_buffer(record_batch, body, buffer_index); + auto data_buffer_span = utils::get_buffer(record_batch, body, buffer_index); + + if (compression) + { + buffers.push_back(utils::get_decompressed_buffer(validity_buffer_span, compression)); + buffers.push_back(utils::get_decompressed_buffer(data_buffer_span, compression)); + } + else + { + buffers.push_back(validity_buffer_span); + buffers.push_back(data_buffer_span); + } + + // TODO bitmap_ptr is not used anymore... Leave it for now, and remove later if no need confirmed + const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); + + ArrowArray array = make_arrow_array( + record_batch.length(), + null_count, + 0, + 0, + nullptr, + nullptr, + std::move(buffers) + ); + + sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; + return sparrow::timestamp_array{std::move(ap)}; + } + + template + [[nodiscard]] sparrow::timestamp_without_timezone_array deserialize_non_owning_timestamp_without_timezone_array( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + std::string_view name, + const std::optional>& metadata, + bool nullable, + size_t& buffer_index + ) + { + const std::string_view format = data_type_to_format( + sparrow::detail::get_data_type_from_array>::get() + ); + + // Set up flags based on nullable + std::optional> flags; + if (nullable) + { + flags = std::unordered_set{sparrow::ArrowFlag::NULLABLE}; + } + + ArrowSchema schema = make_non_owning_arrow_schema( + format, + name.data(), + metadata, + flags, + 0, + nullptr, + nullptr + ); + + const auto compression = record_batch.compression(); + std::vector buffers; + + auto validity_buffer_span = utils::get_buffer(record_batch, body, buffer_index); + auto data_buffer_span = utils::get_buffer(record_batch, body, buffer_index); + + if (compression) + { + buffers.push_back(utils::get_decompressed_buffer(validity_buffer_span, compression)); + buffers.push_back(utils::get_decompressed_buffer(data_buffer_span, compression)); + } + else + { + buffers.push_back(validity_buffer_span); + buffers.push_back(data_buffer_span); + } + + // TODO bitmap_ptr is not used anymore... Leave it for now, and remove later if no need confirmed + const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); + + ArrowArray array = make_arrow_array( + record_batch.length(), + null_count, + 0, + 0, + nullptr, + nullptr, + std::move(buffers) + ); + + sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; + return sparrow::timestamp_without_timezone_array{std::move(ap)}; + } + + template + [[nodiscard]] sparrow::time_array deserialize_non_owning_time_array( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + std::string_view name, + const std::optional>& metadata, + bool nullable, + size_t& buffer_index + ) + { + const std::string_view format = data_type_to_format( + sparrow::detail::get_data_type_from_array>::get() + ); + + // Set up flags based on nullable + std::optional> flags; + if (nullable) + { + flags = std::unordered_set{sparrow::ArrowFlag::NULLABLE}; + } + + ArrowSchema schema = make_non_owning_arrow_schema( + format, + name.data(), + metadata, + flags, + 0, + nullptr, + nullptr + ); + + const auto compression = record_batch.compression(); + std::vector buffers; + + auto validity_buffer_span = utils::get_buffer(record_batch, body, buffer_index); + auto data_buffer_span = utils::get_buffer(record_batch, body, buffer_index); + + if (compression) + { + buffers.push_back(utils::get_decompressed_buffer(validity_buffer_span, compression)); + buffers.push_back(utils::get_decompressed_buffer(data_buffer_span, compression)); + } + else + { + buffers.push_back(validity_buffer_span); + buffers.push_back(data_buffer_span); + } + + // TODO bitmap_ptr is not used anymore... Leave it for now, and remove later if no need confirmed + const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); + + ArrowArray array = make_arrow_array( + record_batch.length(), + null_count, + 0, + 0, + nullptr, + nullptr, + std::move(buffers) + ); + + sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; + return sparrow::time_array{std::move(ap)}; + } +} diff --git a/include/sparrow_ipc/utils.hpp b/include/sparrow_ipc/utils.hpp index 8502cbf..4f7befe 100644 --- a/include/sparrow_ipc/utils.hpp +++ b/include/sparrow_ipc/utils.hpp @@ -11,7 +11,10 @@ namespace sparrow_ipc::utils { // Aligns a value to the next multiple of 8, as required by the Arrow IPC format for message bodies - SPARROW_IPC_API size_t align_to_8(const size_t n); + inline size_t align_to_8(const size_t n) + { + return (n + 7) & -8; + } /** * @brief Extracts words after ':' separated by ',' from a string. @@ -93,11 +96,10 @@ namespace sparrow_ipc::utils return true; } + std::optional parse_after_separator(std::string_view format_str, std::string_view sep); // Parse the format string // The format string is expected to be "w:size", "+w:size", "d:precision,scale", etc std::optional parse_format(std::string_view format_str, std::string_view sep); - // Used to parse timezone - std::optional parse_format_string(std::string_view format_str, std::string_view sep); // size_t calculate_output_serialized_size(const sparrow::record_batch& record_batch); /** diff --git a/src/deserialize.cpp b/src/deserialize.cpp index 168cb7d..4e533e4 100644 --- a/src/deserialize.cpp +++ b/src/deserialize.cpp @@ -8,6 +8,7 @@ #include "sparrow_ipc/deserialize_interval_array.hpp" #include "sparrow_ipc/deserialize_null_array.hpp" #include "sparrow_ipc/deserialize_primitive_array.hpp" +#include "sparrow_ipc/deserialize_time_related_arrays.hpp" #include "sparrow_ipc/deserialize_variable_size_binary_array.hpp" #include "sparrow_ipc/encapsulated_message.hpp" #include "sparrow_ipc/magic_values.hpp" diff --git a/src/flatbuffer_utils.cpp b/src/flatbuffer_utils.cpp index d8ee511..a8bf7ef 100644 --- a/src/flatbuffer_utils.cpp +++ b/src/flatbuffer_utils.cpp @@ -8,6 +8,28 @@ namespace sparrow_ipc { + namespace + { + std::pair> + get_flatbuffer_timestamp_type( + flatbuffers::FlatBufferBuilder& builder, + std::string_view format_str, + org::apache::arrow::flatbuf::TimeUnit time_unit) + { + const auto timezone = utils::parse_after_separator(format_str, ":"); + flatbuffers::Offset timezone_offset = 0; + if (timezone.has_value() && !timezone.value().empty()) + { + timezone_offset = builder.CreateString(timezone.value()); + } + const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp( + builder, + time_unit, + timezone_offset); + return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()}; + } + } + std::pair> get_flatbuffer_type(flatbuffers::FlatBufferBuilder& builder, std::string_view format_str) { @@ -136,63 +158,35 @@ namespace sparrow_ipc } case sparrow::data_type::TIMESTAMP_SECONDS: { - const auto timezone = utils::parse_format_string(format_str, ":"); - flatbuffers::Offset timezone_offset = 0; - if (timezone.has_value() && !timezone.value().empty()) - { - timezone_offset = builder.CreateString(timezone.value()); - } - const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp( + return get_flatbuffer_timestamp_type( builder, - org::apache::arrow::flatbuf::TimeUnit::SECOND, - timezone_offset + format_str, + org::apache::arrow::flatbuf::TimeUnit::SECOND ); - return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()}; } case sparrow::data_type::TIMESTAMP_MILLISECONDS: { - const auto timezone = utils::parse_format_string(format_str, ":"); - flatbuffers::Offset timezone_offset = 0; - if (timezone.has_value() && !timezone.value().empty()) - { - timezone_offset = builder.CreateString(timezone.value()); - } - const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp( + return get_flatbuffer_timestamp_type( builder, - org::apache::arrow::flatbuf::TimeUnit::MILLISECOND, - timezone_offset + format_str, + org::apache::arrow::flatbuf::TimeUnit::MILLISECOND ); - return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()}; } case sparrow::data_type::TIMESTAMP_MICROSECONDS: { - const auto timezone = utils::parse_format_string(format_str, ":"); - flatbuffers::Offset timezone_offset = 0; - if (timezone.has_value() && !timezone.value().empty()) - { - timezone_offset = builder.CreateString(timezone.value()); - } - const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp( + return get_flatbuffer_timestamp_type( builder, - org::apache::arrow::flatbuf::TimeUnit::MICROSECOND, - timezone_offset + format_str, + org::apache::arrow::flatbuf::TimeUnit::MICROSECOND ); - return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()}; } case sparrow::data_type::TIMESTAMP_NANOSECONDS: { - const auto timezone = utils::parse_format_string(format_str, ":"); - flatbuffers::Offset timezone_offset = 0; - if (timezone.has_value() && !timezone.value().empty()) - { - timezone_offset = builder.CreateString(timezone.value()); - } - const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp( + return get_flatbuffer_timestamp_type( builder, - org::apache::arrow::flatbuf::TimeUnit::NANOSECOND, - timezone_offset + format_str, + org::apache::arrow::flatbuf::TimeUnit::NANOSECOND ); - return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()}; } case sparrow::data_type::DURATION_SECONDS: { diff --git a/src/utils.cpp b/src/utils.cpp index 2303538..fb7fa9a 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -4,16 +4,24 @@ namespace sparrow_ipc::utils { - std::optional parse_format(std::string_view format_str, std::string_view sep) + std::optional parse_after_separator(std::string_view format_str, std::string_view sep) { - // Find the position of the delimiter const auto sep_pos = format_str.find(sep); if (sep_pos == std::string_view::npos) { return std::nullopt; } + return format_str.substr(sep_pos + sep.length()); + } - std::string_view substr_str(format_str.data() + sep_pos + 1, format_str.size() - sep_pos - 1); + std::optional parse_format(std::string_view format_str, std::string_view sep) + { + const auto substr_opt = parse_after_separator(format_str, sep); + if (!substr_opt) + { + return std::nullopt; + } + const auto& substr_str = substr_opt.value(); int32_t substr_size = 0; const auto [ptr, ec] = std::from_chars( From cfd5eb55fe10162b9dbf4c660ecc947cebf2cccd Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Mon, 15 Dec 2025 12:11:40 +0100 Subject: [PATCH 03/15] Refactor --- .../sparrow_ipc/deserialize_array_impl.hpp | 12 +- .../deserialize_time_related_arrays.hpp | 236 +++--------------- src/deserialize.cpp | 2 + 3 files changed, 42 insertions(+), 208 deletions(-) diff --git a/include/sparrow_ipc/deserialize_array_impl.hpp b/include/sparrow_ipc/deserialize_array_impl.hpp index ee78122..88cf4e7 100644 --- a/include/sparrow_ipc/deserialize_array_impl.hpp +++ b/include/sparrow_ipc/deserialize_array_impl.hpp @@ -1,6 +1,9 @@ #pragma once #include +#include +#include +#include #include #include @@ -38,12 +41,13 @@ namespace sparrow_ipc::detail std::string_view name, const std::optional>& metadata, bool nullable, - size_t& buffer_index + size_t& buffer_index, + std::optional format_override = std::nullopt ) { - const std::string_view format = data_type_to_format( - sparrow::detail::get_data_type_from_array>::get() - ); + const std::string_view format = format_override.has_value() + ? *format_override + : data_type_to_format(sparrow::detail::get_data_type_from_array>::get()); // Set up flags based on nullable std::optional> flags; diff --git a/include/sparrow_ipc/deserialize_time_related_arrays.hpp b/include/sparrow_ipc/deserialize_time_related_arrays.hpp index 4dcca59..85fd155 100644 --- a/include/sparrow_ipc/deserialize_time_related_arrays.hpp +++ b/include/sparrow_ipc/deserialize_time_related_arrays.hpp @@ -1,7 +1,9 @@ #pragma once #include -#include +#include +#include +#include #include #include @@ -11,9 +13,7 @@ #include #include "Message_generated.h" -#include "sparrow_ipc/arrow_interface/arrow_array.hpp" -#include "sparrow_ipc/arrow_interface/arrow_schema.hpp" -#include "sparrow_ipc/deserialize_utils.hpp" +#include "sparrow_ipc/deserialize_array_impl.hpp" namespace sparrow_ipc { @@ -27,59 +27,15 @@ namespace sparrow_ipc size_t& buffer_index ) { - const std::string_view format = data_type_to_format( - sparrow::detail::get_data_type_from_array>::get() - ); - - // Set up flags based on nullable - std::optional> flags; - if (nullable) - { - flags = std::unordered_set{sparrow::ArrowFlag::NULLABLE}; - } - - ArrowSchema schema = make_non_owning_arrow_schema( - format, - name.data(), + return detail::deserialize_non_owning_simple_array( + record_batch, + body, + name, metadata, - flags, - 0, - nullptr, - nullptr - ); - - const auto compression = record_batch.compression(); - std::vector buffers; - - auto validity_buffer_span = utils::get_buffer(record_batch, body, buffer_index); - auto data_buffer_span = utils::get_buffer(record_batch, body, buffer_index); - - if (compression) - { - buffers.push_back(utils::get_decompressed_buffer(validity_buffer_span, compression)); - buffers.push_back(utils::get_decompressed_buffer(data_buffer_span, compression)); - } - else - { - buffers.push_back(validity_buffer_span); - buffers.push_back(data_buffer_span); - } - - // TODO bitmap_ptr is not used anymore... Leave it for now, and remove later if no need confirmed - const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); - - ArrowArray array = make_arrow_array( - record_batch.length(), - null_count, - 0, - 0, - nullptr, - nullptr, - std::move(buffers) + nullable, + buffer_index, + std::nullopt ); - - sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; - return sparrow::date_array{std::move(ap)}; } template @@ -97,55 +53,15 @@ namespace sparrow_ipc sparrow::detail::get_data_type_from_array>::get() )) + timezone; - // Set up flags based on nullable - std::optional> flags; - if (nullable) - { - flags = std::unordered_set{sparrow::ArrowFlag::NULLABLE}; - } - - ArrowSchema schema = make_non_owning_arrow_schema( - format.c_str(), - name.data(), + return detail::deserialize_non_owning_simple_array( + record_batch, + body, + name, metadata, - flags, - 0, - nullptr, - nullptr + nullable, + buffer_index, + std::move(format) ); - - const auto compression = record_batch.compression(); - std::vector buffers; - - auto validity_buffer_span = utils::get_buffer(record_batch, body, buffer_index); - auto data_buffer_span = utils::get_buffer(record_batch, body, buffer_index); - - if (compression) - { - buffers.push_back(utils::get_decompressed_buffer(validity_buffer_span, compression)); - buffers.push_back(utils::get_decompressed_buffer(data_buffer_span, compression)); - } - else - { - buffers.push_back(validity_buffer_span); - buffers.push_back(data_buffer_span); - } - - // TODO bitmap_ptr is not used anymore... Leave it for now, and remove later if no need confirmed - const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); - - ArrowArray array = make_arrow_array( - record_batch.length(), - null_count, - 0, - 0, - nullptr, - nullptr, - std::move(buffers) - ); - - sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; - return sparrow::timestamp_array{std::move(ap)}; } template @@ -158,59 +74,15 @@ namespace sparrow_ipc size_t& buffer_index ) { - const std::string_view format = data_type_to_format( - sparrow::detail::get_data_type_from_array>::get() - ); - - // Set up flags based on nullable - std::optional> flags; - if (nullable) - { - flags = std::unordered_set{sparrow::ArrowFlag::NULLABLE}; - } - - ArrowSchema schema = make_non_owning_arrow_schema( - format, - name.data(), + return detail::deserialize_non_owning_simple_array( + record_batch, + body, + name, metadata, - flags, - 0, - nullptr, - nullptr - ); - - const auto compression = record_batch.compression(); - std::vector buffers; - - auto validity_buffer_span = utils::get_buffer(record_batch, body, buffer_index); - auto data_buffer_span = utils::get_buffer(record_batch, body, buffer_index); - - if (compression) - { - buffers.push_back(utils::get_decompressed_buffer(validity_buffer_span, compression)); - buffers.push_back(utils::get_decompressed_buffer(data_buffer_span, compression)); - } - else - { - buffers.push_back(validity_buffer_span); - buffers.push_back(data_buffer_span); - } - - // TODO bitmap_ptr is not used anymore... Leave it for now, and remove later if no need confirmed - const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); - - ArrowArray array = make_arrow_array( - record_batch.length(), - null_count, - 0, - 0, - nullptr, - nullptr, - std::move(buffers) + nullable, + buffer_index, + std::nullopt ); - - sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; - return sparrow::timestamp_without_timezone_array{std::move(ap)}; } template @@ -223,58 +95,14 @@ namespace sparrow_ipc size_t& buffer_index ) { - const std::string_view format = data_type_to_format( - sparrow::detail::get_data_type_from_array>::get() - ); - - // Set up flags based on nullable - std::optional> flags; - if (nullable) - { - flags = std::unordered_set{sparrow::ArrowFlag::NULLABLE}; - } - - ArrowSchema schema = make_non_owning_arrow_schema( - format, - name.data(), + return detail::deserialize_non_owning_simple_array( + record_batch, + body, + name, metadata, - flags, - 0, - nullptr, - nullptr + nullable, + buffer_index, + std::nullopt ); - - const auto compression = record_batch.compression(); - std::vector buffers; - - auto validity_buffer_span = utils::get_buffer(record_batch, body, buffer_index); - auto data_buffer_span = utils::get_buffer(record_batch, body, buffer_index); - - if (compression) - { - buffers.push_back(utils::get_decompressed_buffer(validity_buffer_span, compression)); - buffers.push_back(utils::get_decompressed_buffer(data_buffer_span, compression)); - } - else - { - buffers.push_back(validity_buffer_span); - buffers.push_back(data_buffer_span); - } - - // TODO bitmap_ptr is not used anymore... Leave it for now, and remove later if no need confirmed - const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); - - ArrowArray array = make_arrow_array( - record_batch.length(), - null_count, - 0, - 0, - nullptr, - nullptr, - std::move(buffers) - ); - - sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; - return sparrow::time_array{std::move(ap)}; } } diff --git a/src/deserialize.cpp b/src/deserialize.cpp index 4e533e4..3fea9b2 100644 --- a/src/deserialize.cpp +++ b/src/deserialize.cpp @@ -466,6 +466,8 @@ namespace sparrow_ipc arrays.emplace_back(deserialize_non_owning_time_array_lambda.template operator()()); break; } + break; + } case org::apache::arrow::flatbuf::Type::Null: arrays.emplace_back(deserialize_non_owning_null( record_batch, From f03e2dcca366acf2c512c2a4f6179e5ef442bbaf Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Mon, 15 Dec 2025 13:58:00 +0100 Subject: [PATCH 04/15] Add debug --- .github/workflows/windows.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index baeb113..73ed64c 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -55,7 +55,7 @@ jobs: - name: Run tests working-directory: build run: | - cmake --build . --config ${{ matrix.build_type }} --target run_tests_with_junit_report + cmake --build . --config ${{ matrix.build_type }} --target run_tests - name: Build example working-directory: build @@ -121,7 +121,7 @@ jobs: - name: Run tests working-directory: build - run: cmake --build . --config ${{ matrix.build_type }} --target run_tests_with_junit_report + run: cmake --build . --config ${{ matrix.build_type }} --target run_tests - name: Build example working-directory: build From e142955fb65d0e5b7d0f9e38bacd9a04612f7630 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Mon, 15 Dec 2025 14:08:18 +0100 Subject: [PATCH 05/15] Check tests log --- .github/workflows/windows.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 73ed64c..bd680a1 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -55,7 +55,8 @@ jobs: - name: Run tests working-directory: build run: | - cmake --build . --config ${{ matrix.build_type }} --target run_tests + ctest --output-on-failure + cat tests/Testing/Temporary/LastTest.log - name: Build example working-directory: build @@ -121,7 +122,9 @@ jobs: - name: Run tests working-directory: build - run: cmake --build . --config ${{ matrix.build_type }} --target run_tests + run: | + ctest --output-on-failure + cat tests/Testing/Temporary/LastTest.log - name: Build example working-directory: build From bf91b873ff84b9d9d0fd0e85bc12d91c93e6833f Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Mon, 15 Dec 2025 14:16:48 +0100 Subject: [PATCH 06/15] Add matrix config --- .github/workflows/windows.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index bd680a1..af38754 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -55,7 +55,7 @@ jobs: - name: Run tests working-directory: build run: | - ctest --output-on-failure + ctest -C ${{ matrix.build_type }} --output-on-failure cat tests/Testing/Temporary/LastTest.log - name: Build example @@ -123,7 +123,7 @@ jobs: - name: Run tests working-directory: build run: | - ctest --output-on-failure + ctest -C ${{ matrix.build_type }} --output-on-failure cat tests/Testing/Temporary/LastTest.log - name: Build example From ce009af7d8fab521720fc1835a94537710b6fafa Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Tue, 16 Dec 2025 13:51:23 +0100 Subject: [PATCH 07/15] Link sparrow-ipc to date --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ec82fec..b3e46ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -282,6 +282,8 @@ target_link_libraries(sparrow-ipc PRIVATE lz4::lz4 zstd::libzstd + date::date + date::date-tz ) # Ensure generated headers are available when building sparrow-ipc From 0f0395da558b46100548d0546be9a0895fbca389 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Tue, 16 Dec 2025 13:58:38 +0100 Subject: [PATCH 08/15] Add find_package --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b3e46ae..44ddb04 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -275,6 +275,7 @@ target_include_directories(sparrow-ipc PRIVATE $) +find_package(date REQUIRED) target_link_libraries(sparrow-ipc PUBLIC sparrow::sparrow From 902769feb60f03b3eac3bfbf7df83d48edbe870c Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Tue, 16 Dec 2025 14:19:19 +0100 Subject: [PATCH 09/15] Link publically --- CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 44ddb04..d798a7b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -275,7 +275,10 @@ target_include_directories(sparrow-ipc PRIVATE $) + find_package(date REQUIRED) +target_link_libraries(sparrow-ipc PUBLIC date::date date::date-tz) + target_link_libraries(sparrow-ipc PUBLIC sparrow::sparrow @@ -283,8 +286,6 @@ target_link_libraries(sparrow-ipc PRIVATE lz4::lz4 zstd::libzstd - date::date - date::date-tz ) # Ensure generated headers are available when building sparrow-ipc From 5f145760346031ad951060470ae100f6a8731d47 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Tue, 16 Dec 2025 15:00:04 +0100 Subject: [PATCH 10/15] Add logs for debug --- CMakeLists.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index d798a7b..a45e151 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -277,6 +277,24 @@ target_include_directories(sparrow-ipc find_package(date REQUIRED) +if(date_FOUND) + message(STATUS "✅ Found date package: ${date_DIR}") +else() + message(STATUS "❌ date package not found!") +endif() + +if(date::date) + message(STATUS "✅ date::date available") +else() + message(STATUS "❌ date::date available") +endif() + +if(date::date-tz) + message(STATUS "✅ date::date-tz available") +else() + message(STATUS "❌ date::date-tz available") +endif() + target_link_libraries(sparrow-ipc PUBLIC date::date date::date-tz) target_link_libraries(sparrow-ipc From 84e1192b1ddf2f1a8bc9e9cfca662c7a657ee813 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Thu, 18 Dec 2025 10:32:34 +0100 Subject: [PATCH 11/15] Use sparrow 2.0.0 and remove debug logs --- .github/workflows/windows.yml | 7 ++---- CMakeLists.txt | 40 +++++++++++++++++------------------ 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index af38754..baeb113 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -55,8 +55,7 @@ jobs: - name: Run tests working-directory: build run: | - ctest -C ${{ matrix.build_type }} --output-on-failure - cat tests/Testing/Temporary/LastTest.log + cmake --build . --config ${{ matrix.build_type }} --target run_tests_with_junit_report - name: Build example working-directory: build @@ -122,9 +121,7 @@ jobs: - name: Run tests working-directory: build - run: | - ctest -C ${{ matrix.build_type }} --output-on-failure - cat tests/Testing/Temporary/LastTest.log + run: cmake --build . --config ${{ matrix.build_type }} --target run_tests_with_junit_report - name: Build example working-directory: build diff --git a/CMakeLists.txt b/CMakeLists.txt index a45e151..9569c8d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -276,26 +276,26 @@ target_include_directories(sparrow-ipc $) -find_package(date REQUIRED) -if(date_FOUND) - message(STATUS "✅ Found date package: ${date_DIR}") -else() - message(STATUS "❌ date package not found!") -endif() - -if(date::date) - message(STATUS "✅ date::date available") -else() - message(STATUS "❌ date::date available") -endif() - -if(date::date-tz) - message(STATUS "✅ date::date-tz available") -else() - message(STATUS "❌ date::date-tz available") -endif() - -target_link_libraries(sparrow-ipc PUBLIC date::date date::date-tz) +# find_package(date REQUIRED) +# if(date_FOUND) +# message(STATUS "✅ Found date package: ${date_DIR}") +# else() +# message(STATUS "❌ date package not found!") +# endif() +# +# if(date::date) +# message(STATUS "✅ date::date available") +# else() +# message(STATUS "❌ date::date available") +# endif() +# +# if(date::date-tz) +# message(STATUS "✅ date::date-tz available") +# else() +# message(STATUS "❌ date::date-tz available") +# endif() +# +# target_link_libraries(sparrow-ipc PUBLIC date::date date::date-tz) target_link_libraries(sparrow-ipc PUBLIC From 54a092e478d4bdaf3ef8153d31f3db1bcff41c03 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Thu, 18 Dec 2025 11:02:25 +0100 Subject: [PATCH 12/15] USE_DATE_POLYFILL=ON when building sparrow only if not windows --- cmake/external_dependencies.cmake | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/cmake/external_dependencies.cmake b/cmake/external_dependencies.cmake index aea1225..32245f6 100644 --- a/cmake/external_dependencies.cmake +++ b/cmake/external_dependencies.cmake @@ -71,11 +71,23 @@ set(SPARROW_BUILD_SHARED ${SPARROW_IPC_BUILD_SHARED}) if(${SPARROW_IPC_BUILD_TESTS} OR ${SPARROW_IPC_BUILD_INTEGRATION_TESTS}) set(CREATE_JSON_READER_TARGET ON) endif() -find_package_or_fetch( - PACKAGE_NAME sparrow - GIT_REPOSITORY https://github.com/man-group/sparrow.git - TAG 2.0.0 -) + +if(NOT WIN32) + find_package_or_fetch( + PACKAGE_NAME sparrow + GIT_REPOSITORY https://github.com/man-group/sparrow.git + TAG 2.0.0 + CMAKE_ARGS + "USE_DATE_POLYFILL=ON" + ) +else() + find_package_or_fetch( + PACKAGE_NAME sparrow + GIT_REPOSITORY https://github.com/man-group/sparrow.git + TAG 2.0.0 + ) +endif() + unset(CREATE_JSON_READER_TARGET) if(NOT TARGET sparrow::sparrow) From 1a791d795c0489185be9c6ea54a076ca5e87a281 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Thu, 18 Dec 2025 11:20:27 +0100 Subject: [PATCH 13/15] Clean up --- CMakeLists.txt | 22 ---------------------- cmake/external_dependencies.cmake | 1 + conanfile.py | 2 +- 3 files changed, 2 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9569c8d..ec82fec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -275,28 +275,6 @@ target_include_directories(sparrow-ipc PRIVATE $) - -# find_package(date REQUIRED) -# if(date_FOUND) -# message(STATUS "✅ Found date package: ${date_DIR}") -# else() -# message(STATUS "❌ date package not found!") -# endif() -# -# if(date::date) -# message(STATUS "✅ date::date available") -# else() -# message(STATUS "❌ date::date available") -# endif() -# -# if(date::date-tz) -# message(STATUS "✅ date::date-tz available") -# else() -# message(STATUS "❌ date::date-tz available") -# endif() -# -# target_link_libraries(sparrow-ipc PUBLIC date::date date::date-tz) - target_link_libraries(sparrow-ipc PUBLIC sparrow::sparrow diff --git a/cmake/external_dependencies.cmake b/cmake/external_dependencies.cmake index 32245f6..0c81da0 100644 --- a/cmake/external_dependencies.cmake +++ b/cmake/external_dependencies.cmake @@ -72,6 +72,7 @@ if(${SPARROW_IPC_BUILD_TESTS} OR ${SPARROW_IPC_BUILD_INTEGRATION_TESTS}) set(CREATE_JSON_READER_TARGET ON) endif() +# Be consistent with sparrow feedstock on conda-forge if(NOT WIN32) find_package_or_fetch( PACKAGE_NAME sparrow diff --git a/conanfile.py b/conanfile.py index 218d9bf..1409926 100644 --- a/conanfile.py +++ b/conanfile.py @@ -43,7 +43,7 @@ def configure(self): self.options.rm_safe("fPIC") def requirements(self): - self.requires("sparrow/2.0.0") + self.requires("sparrow/1.4.0") self.requires(f"flatbuffers/{self._flatbuffers_version}") self.requires("lz4/1.9.4") self.requires("zstd/1.5.7") From 92db7097db5b01056b641666e64e3a80b78bf220 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Fri, 19 Dec 2025 16:35:43 +0100 Subject: [PATCH 14/15] Fix after rebase --- cmake/external_dependencies.cmake | 21 +++++---------------- src/utils.cpp | 5 ----- 2 files changed, 5 insertions(+), 21 deletions(-) diff --git a/cmake/external_dependencies.cmake b/cmake/external_dependencies.cmake index 0c81da0..2109ec1 100644 --- a/cmake/external_dependencies.cmake +++ b/cmake/external_dependencies.cmake @@ -72,22 +72,11 @@ if(${SPARROW_IPC_BUILD_TESTS} OR ${SPARROW_IPC_BUILD_INTEGRATION_TESTS}) set(CREATE_JSON_READER_TARGET ON) endif() -# Be consistent with sparrow feedstock on conda-forge -if(NOT WIN32) - find_package_or_fetch( - PACKAGE_NAME sparrow - GIT_REPOSITORY https://github.com/man-group/sparrow.git - TAG 2.0.0 - CMAKE_ARGS - "USE_DATE_POLYFILL=ON" - ) -else() - find_package_or_fetch( - PACKAGE_NAME sparrow - GIT_REPOSITORY https://github.com/man-group/sparrow.git - TAG 2.0.0 - ) -endif() +find_package_or_fetch( + PACKAGE_NAME sparrow + GIT_REPOSITORY https://github.com/man-group/sparrow.git + TAG 2.0.0 +) unset(CREATE_JSON_READER_TARGET) diff --git a/src/utils.cpp b/src/utils.cpp index fb7fa9a..e8d65f9 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -47,11 +47,6 @@ namespace sparrow_ipc::utils return format_str.substr(sep_pos + sep.length()); } - size_t align_to_8(const size_t n) - { - return (n + 7) & -8; - } - std::optional>> parse_decimal_format(std::string_view format_str) { // Format can be "d:precision,scale" or "d:precision,scale,bitWidth" From 111fbdd0206fab80105f76213b1eee7847fbf160 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Fri, 19 Dec 2025 16:47:22 +0100 Subject: [PATCH 15/15] Remove leftovers --- conanfile.py | 2 +- src/utils.cpp | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/conanfile.py b/conanfile.py index 1409926..218d9bf 100644 --- a/conanfile.py +++ b/conanfile.py @@ -43,7 +43,7 @@ def configure(self): self.options.rm_safe("fPIC") def requirements(self): - self.requires("sparrow/1.4.0") + self.requires("sparrow/2.0.0") self.requires(f"flatbuffers/{self._flatbuffers_version}") self.requires("lz4/1.9.4") self.requires("zstd/1.5.7") diff --git a/src/utils.cpp b/src/utils.cpp index e8d65f9..ed4ae79 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -37,16 +37,6 @@ namespace sparrow_ipc::utils return substr_size; } - std::optional parse_format_string(std::string_view format_str, std::string_view sep) - { - const size_t sep_pos = format_str.find(sep); - if (sep_pos == std::string_view::npos) - { - return std::nullopt; - } - return format_str.substr(sep_pos + sep.length()); - } - std::optional>> parse_decimal_format(std::string_view format_str) { // Format can be "d:precision,scale" or "d:precision,scale,bitWidth"