Skip to content
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ set(SPARROW_IPC_HEADERS
${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_interval_array.hpp
${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_null_array.hpp
${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_primitive_array.hpp
${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_time_related_arrays.hpp
${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_utils.hpp
${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_variable_size_binary_array.hpp
${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize.hpp
Expand Down
2 changes: 2 additions & 0 deletions cmake/external_dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,13 @@ set(SPARROW_BUILD_SHARED ${SPARROW_IPC_BUILD_SHARED})
if(${SPARROW_IPC_BUILD_TESTS} OR ${SPARROW_IPC_BUILD_INTEGRATION_TESTS})
set(CREATE_JSON_READER_TARGET ON)
endif()

find_package_or_fetch(
PACKAGE_NAME sparrow
GIT_REPOSITORY https://github.com/man-group/sparrow.git
TAG 2.0.0
)

unset(CREATE_JSON_READER_TARGET)

if(NOT TARGET sparrow::sparrow)
Expand Down
12 changes: 8 additions & 4 deletions include/sparrow_ipc/deserialize_array_impl.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#pragma once

#include <optional>
#include <span>
#include <string>
#include <string_view>
#include <unordered_set>
#include <vector>

Expand Down Expand Up @@ -38,12 +41,13 @@
std::string_view name,
const std::optional<std::vector<sparrow::metadata_pair>>& metadata,
bool nullable,
size_t& buffer_index
size_t& buffer_index,
std::optional<std::string> format_override = std::nullopt
)
{
const std::string_view format = data_type_to_format(
sparrow::detail::get_data_type_from_array<ArrayType<T>>::get()
);
const std::string_view format = format_override.has_value()
? *format_override
: data_type_to_format(sparrow::detail::get_data_type_from_array<ArrayType<T>>::get());

// Set up flags based on nullable
std::optional<std::unordered_set<sparrow::ArrowFlag>> flags;
Expand Down
108 changes: 108 additions & 0 deletions include/sparrow_ipc/deserialize_time_related_arrays.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#pragma once

#include <optional>
#include <span>
#include <string>
#include <string_view>
#include <vector>

#include <sparrow/arrow_interface/arrow_array_schema_proxy.hpp>
#include <sparrow/date_array.hpp>
#include <sparrow/time_array.hpp>
#include <sparrow/timestamp_array.hpp>
#include <sparrow/timestamp_without_timezone_array.hpp>

#include "Message_generated.h"
#include "sparrow_ipc/deserialize_array_impl.hpp"

namespace sparrow_ipc
{
template <typename T>
[[nodiscard]] sparrow::date_array<T> deserialize_non_owning_date_array(
const org::apache::arrow::flatbuf::RecordBatch& record_batch,
std::span<const uint8_t> body,
std::string_view name,
const std::optional<std::vector<sparrow::metadata_pair>>& metadata,
bool nullable,
size_t& buffer_index
)
{
return detail::deserialize_non_owning_simple_array<sparrow::date_array, T>(
record_batch,
body,
name,
metadata,
nullable,
buffer_index,
std::nullopt
);
}

template <typename T>
[[nodiscard]] sparrow::timestamp_array<T> deserialize_non_owning_timestamp_array(
const org::apache::arrow::flatbuf::RecordBatch& record_batch,
std::span<const uint8_t> body,
std::string_view name,
const std::optional<std::vector<sparrow::metadata_pair>>& metadata,
bool nullable,
size_t& buffer_index,
const std::string& timezone
)
{
std::string format = std::string(data_type_to_format(
sparrow::detail::get_data_type_from_array<sparrow::timestamp_array<T>>::get()
)) + timezone;

return detail::deserialize_non_owning_simple_array<sparrow::timestamp_array, T>(
record_batch,
body,
name,
metadata,
nullable,
buffer_index,
std::move(format)
);
}

template <typename T>
[[nodiscard]] sparrow::timestamp_without_timezone_array<T> deserialize_non_owning_timestamp_without_timezone_array(
const org::apache::arrow::flatbuf::RecordBatch& record_batch,
std::span<const uint8_t> body,
std::string_view name,
const std::optional<std::vector<sparrow::metadata_pair>>& metadata,
bool nullable,
size_t& buffer_index
)
{
return detail::deserialize_non_owning_simple_array<sparrow::timestamp_without_timezone_array, T>(
record_batch,
body,
name,
metadata,
nullable,
buffer_index,
std::nullopt
);
}

template <typename T>
[[nodiscard]] sparrow::time_array<T> deserialize_non_owning_time_array(
const org::apache::arrow::flatbuf::RecordBatch& record_batch,
std::span<const uint8_t> body,
std::string_view name,
const std::optional<std::vector<sparrow::metadata_pair>>& metadata,
bool nullable,
size_t& buffer_index
)
{
return detail::deserialize_non_owning_simple_array<sparrow::time_array, T>(
record_batch,
body,
name,
metadata,
nullable,
buffer_index,
std::nullopt
);
}
}
6 changes: 5 additions & 1 deletion include/sparrow_ipc/utils.hpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#pragma once

#include <cstdint>
Expand All @@ -11,7 +11,10 @@
namespace sparrow_ipc::utils
{
// Aligns a value to the next multiple of 8, as required by the Arrow IPC format for message bodies
SPARROW_IPC_API size_t align_to_8(const size_t n);
inline size_t align_to_8(const size_t n)
{
return (n + 7) & -8;
}
Comment on lines +14 to +17
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is in the hpp now ?


/**
* @brief Extracts words after ':' separated by ',' from a string.
Expand Down Expand Up @@ -93,6 +96,7 @@
return true;
}

std::optional<std::string_view> parse_after_separator(std::string_view format_str, std::string_view sep);
// Parse the format string
// The format string is expected to be "w:size", "+w:size", "d:precision,scale", etc
std::optional<int32_t> parse_format(std::string_view format_str, std::string_view sep);
Expand Down
132 changes: 132 additions & 0 deletions src/deserialize.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#include "sparrow_ipc/deserialize.hpp"

#include <sparrow/types/data_type.hpp>
Expand All @@ -8,6 +8,7 @@
#include "sparrow_ipc/deserialize_interval_array.hpp"
#include "sparrow_ipc/deserialize_null_array.hpp"
#include "sparrow_ipc/deserialize_primitive_array.hpp"
#include "sparrow_ipc/deserialize_time_related_arrays.hpp"
#include "sparrow_ipc/deserialize_variable_size_binary_array.hpp"
#include "sparrow_ipc/encapsulated_message.hpp"
#include "sparrow_ipc/magic_values.hpp"
Expand Down Expand Up @@ -81,7 +82,7 @@
for (const auto field : *(schema.fields()))
{
const ::flatbuffers::Vector<::flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>>*
fb_custom_metadata = field->custom_metadata();

Check warning on line 85 in src/deserialize.cpp

View workflow job for this annotation

GitHub Actions / build

src/deserialize.cpp:85:17 [clang-analyzer-deadcode.DeadStores]

Value stored to 'fb_custom_metadata' during its initialization is never read
const std::optional<std::vector<sparrow::metadata_pair>>& metadata = field_metadata[field_idx++];
const std::string name = field->name() == nullptr ? "" : field->name()->str();
const bool nullable = field->nullable();
Expand All @@ -98,6 +99,56 @@
buffer_index
);
};

const auto deserialize_non_owning_date_array_lambda = [&]<typename T>()
{
return deserialize_non_owning_date_array<T>(
record_batch,
encapsulated_message.body(),
name,
metadata,
nullable,
buffer_index
);
};

const auto deserialize_non_owning_timestamp_array_lambda = [&]<typename T>(const std::string& timezone)
{
return deserialize_non_owning_timestamp_array<T>(
record_batch,
encapsulated_message.body(),
name,
metadata,
nullable,
buffer_index,
timezone
);
};

const auto deserialize_non_owning_timestamp_without_timezone_array_lambda = [&]<typename T>()
{
return deserialize_non_owning_timestamp_without_timezone_array<T>(
record_batch,
encapsulated_message.body(),
name,
metadata,
nullable,
buffer_index
);
};

const auto deserialize_non_owning_time_array_lambda = [&]<typename T>()
{
return deserialize_non_owning_time_array<T>(
record_batch,
encapsulated_message.body(),
name,
metadata,
nullable,
buffer_index
);
};

switch (field_type)
{
case org::apache::arrow::flatbuf::Type::Bool:
Expand Down Expand Up @@ -336,6 +387,87 @@
}
}
break;
case org::apache::arrow::flatbuf::Type::Timestamp:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this swith start to be a bit big. We should create functions for each of the types. Let's do that in another PR

{
const auto timestamp_type = field->type_as_Timestamp();
const auto time_unit = timestamp_type->unit();
const bool has_timezone = timestamp_type->timezone() != nullptr;

if (has_timezone)
{
const std::string timezone = timestamp_type->timezone()->str();
switch (time_unit)
{
case org::apache::arrow::flatbuf::TimeUnit::SECOND:
arrays.emplace_back(deserialize_non_owning_timestamp_array_lambda.template operator()<sparrow::timestamp_second>(timezone));
break;
case org::apache::arrow::flatbuf::TimeUnit::MILLISECOND:
arrays.emplace_back(deserialize_non_owning_timestamp_array_lambda.template operator()<sparrow::timestamp_millisecond>(timezone));
break;
case org::apache::arrow::flatbuf::TimeUnit::MICROSECOND:
arrays.emplace_back(deserialize_non_owning_timestamp_array_lambda.template operator()<sparrow::timestamp_microsecond>(timezone));
break;
case org::apache::arrow::flatbuf::TimeUnit::NANOSECOND:
arrays.emplace_back(deserialize_non_owning_timestamp_array_lambda.template operator()<sparrow::timestamp_nanosecond>(timezone));
break;
}
}
else
{
switch (time_unit)
{
case org::apache::arrow::flatbuf::TimeUnit::SECOND:
arrays.emplace_back(deserialize_non_owning_timestamp_without_timezone_array_lambda.template operator()<sparrow::zoned_time_without_timezone_seconds>());
break;
case org::apache::arrow::flatbuf::TimeUnit::MILLISECOND:
arrays.emplace_back(deserialize_non_owning_timestamp_without_timezone_array_lambda.template operator()<sparrow::zoned_time_without_timezone_milliseconds>());
break;
case org::apache::arrow::flatbuf::TimeUnit::MICROSECOND:
arrays.emplace_back(deserialize_non_owning_timestamp_without_timezone_array_lambda.template operator()<sparrow::zoned_time_without_timezone_microseconds>());
break;
case org::apache::arrow::flatbuf::TimeUnit::NANOSECOND:
arrays.emplace_back(deserialize_non_owning_timestamp_without_timezone_array_lambda.template operator()<sparrow::zoned_time_without_timezone_nanoseconds>());
break;
}
}
break;
}
case org::apache::arrow::flatbuf::Type::Date:
{
const auto date_type = field->type_as_Date();
const auto date_unit = date_type->unit();
switch (date_unit)
{
case org::apache::arrow::flatbuf::DateUnit::DAY:
arrays.emplace_back(deserialize_non_owning_date_array_lambda.template operator()<sparrow::date_days>());
break;
case org::apache::arrow::flatbuf::DateUnit::MILLISECOND:
arrays.emplace_back(deserialize_non_owning_date_array_lambda.template operator()<sparrow::date_milliseconds>());
break;
}
break;
}
case org::apache::arrow::flatbuf::Type::Time:
{
const auto time_type = field->type_as_Time();
const auto time_unit = time_type->unit();
switch (time_unit)
{
case org::apache::arrow::flatbuf::TimeUnit::SECOND:
arrays.emplace_back(deserialize_non_owning_time_array_lambda.template operator()<sparrow::chrono::time_seconds>());
break;
case org::apache::arrow::flatbuf::TimeUnit::MILLISECOND:
arrays.emplace_back(deserialize_non_owning_time_array_lambda.template operator()<sparrow::chrono::time_milliseconds>());
break;
case org::apache::arrow::flatbuf::TimeUnit::MICROSECOND:
arrays.emplace_back(deserialize_non_owning_time_array_lambda.template operator()<sparrow::chrono::time_microseconds>());
break;
case org::apache::arrow::flatbuf::TimeUnit::NANOSECOND:
arrays.emplace_back(deserialize_non_owning_time_array_lambda.template operator()<sparrow::chrono::time_nanoseconds>());
break;
}
break;
}
case org::apache::arrow::flatbuf::Type::Null:
arrays.emplace_back(deserialize_non_owning_null(
record_batch,
Expand All @@ -351,7 +483,7 @@
const auto decimal_field = field->type_as_Decimal();
const auto scale = decimal_field->scale();
const auto precision = decimal_field->precision();
if (decimal_field->bitWidth() == 32)

Check warning on line 486 in src/deserialize.cpp

View workflow job for this annotation

GitHub Actions / build

src/deserialize.cpp:486:54 [cppcoreguidelines-avoid-magic-numbers]

32 is a magic number; consider replacing it with a named constant
{
arrays.emplace_back(
deserialize_non_owning_decimal<sparrow::decimal<int32_t>>(
Expand All @@ -366,7 +498,7 @@
)
);
}
else if (decimal_field->bitWidth() == 64)

Check warning on line 501 in src/deserialize.cpp

View workflow job for this annotation

GitHub Actions / build

src/deserialize.cpp:501:59 [cppcoreguidelines-avoid-magic-numbers]

64 is a magic number; consider replacing it with a named constant
{
arrays.emplace_back(
deserialize_non_owning_decimal<sparrow::decimal<int64_t>>(
Expand All @@ -381,7 +513,7 @@
)
);
}
else if (decimal_field->bitWidth() == 128)

Check warning on line 516 in src/deserialize.cpp

View workflow job for this annotation

GitHub Actions / build

src/deserialize.cpp:516:59 [cppcoreguidelines-avoid-magic-numbers]

128 is a magic number; consider replacing it with a named constant
{
arrays.emplace_back(
deserialize_non_owning_decimal<sparrow::decimal<sparrow::int128_t>>(
Expand Down
Loading
Loading