diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index be59ee9..d5df436 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,9 +18,9 @@ jobs: # We keep these separate since sometimes the derive fails when # independently built. run: | - cargo build -p arrow2_convert_derive - cargo build -p arrow2_convert - cargo build -p arrow2_convert_example_simple + cargo build -p arrow_convert_derive + cargo build -p arrow_convert + cargo build -p arrow_convert_example_simple clippy: name: Clippy diff --git a/Cargo.toml b/Cargo.toml index e840284..d82b277 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] +resolver = "2" members = [ - "arrow2_convert", - "arrow2_convert_derive", + "arrow_convert", + "arrow_convert_derive", "examples/simple" ] diff --git a/README.md b/README.md index 9c1f29c..e739d23 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# arrow2_convert +# arrow_convert -Provides an API on top of [`arrow2`](https://github.com/jorgecarleitao/arrow2) to convert between rust types and Arrow. +Provides an API on top of [`arrow-rs`](https://github.com/apache/arrow-rs) to convert between rust types and Arrow. This repository was ported directly from the [`arrow2-convert`](https://github.com/DataEngineeringLabs/arrow2-convert) library for use with `arrow-rs`. The Arrow ecosystem provides many ways to convert between Arrow and other popular formats across several languages. This project aims to serve the need for rust-centric data pipelines to easily convert to/from Arrow with strong typing and arbitrary nesting. @@ -8,13 +8,13 @@ The Arrow ecosystem provides many ways to convert between Arrow and other popula The example below performs a round trip conversion of a struct with a single field. -Please see the [complex_example.rs](https://github.com/DataEngineeringLabs/arrow2-convert/blob/main/arrow2_convert/tests/complex_example.rs) for usage of the full functionality. 
+Please see the [complex_example.rs](https://github.com/Swoorup/arrow-convert/blob/main/arrow_convert/tests/complex_example.rs) for usage of the full functionality. ```rust /// Simple example -use arrow2::array::Array; -use arrow2_convert::{deserialize::TryIntoCollection, serialize::TryIntoArrow, ArrowField, ArrowSerialize, ArrowDeserialize}; +use arrow::array::{Array, ArrayRef}; +use arrow_convert::{deserialize::TryIntoCollection, serialize::TryIntoArrow, ArrowField, ArrowSerialize, ArrowDeserialize}; #[derive(Debug, Clone, PartialEq, ArrowField, ArrowSerialize, ArrowDeserialize)] pub struct Foo { @@ -30,11 +30,11 @@ fn main() { ]; // serialize to an arrow array. try_into_arrow() is enabled by the TryIntoArrow trait - let arrow_array: Box = original_array.try_into_arrow().unwrap(); + let arrow_array: ArrayRef = original_array.try_into_arrow().unwrap(); // which can be cast to an Arrow StructArray and be used for all kinds of IPC, FFI, etc. - // supported by `arrow2` - let struct_array= arrow_array.as_any().downcast_ref::().unwrap(); + // supported by `arrow` + let struct_array= arrow_array.as_any().downcast_ref::().unwrap(); assert_eq!(struct_array.len(), 3); // deserialize back to our original vector via TryIntoCollection trait. @@ -49,9 +49,9 @@ Types that implement the `ArrowField`, `ArrowSerialize` and `ArrowDeserialize` t The `ArrowField`, `ArrowSerialize` and `ArrowDeserialize` derive macros can be used to generate implementations of these traits for structs and enums. Custom implementations can also be defined for any type that needs to convert to/from Arrow by manually implementing the traits. -For serializing to arrow, `TryIntoArrow::try_into_arrow` can be used to serialize any iterable into an `arrow2::Array` or a `arrow2::Chunk`. `arrow2::Array` represents the in-memory Arrow layout. `arrow2::Chunk` represents a column group and can be used with `arrow2` API for other functionality such converting to parquet and arrow flight RPC. 
+For serializing to arrow, `TryIntoArrow::try_into_arrow` can be used to serialize any iterable into an `arrow::Array` or a `arrow::Chunk`. `arrow::Array` represents the in-memory Arrow layout. `arrow::Chunk` represents a column group and can be used with the `arrow` API for other functionality such as converting to parquet and arrow flight RPC. -For deserializing from arrow, the `TryIntoCollection::try_into_collection` can be used to deserialize from an `arrow2::Array` representation into any container that implements `FromIterator`. +For deserializing from arrow, the `TryIntoCollection::try_into_collection` can be used to deserialize from an `arrow::Array` representation into any container that implements `FromIterator`. ### Default implementations @@ -66,7 +66,7 @@ Default implementations of the above traits are provided for the following: - [`chrono::NaiveDate`], [`chrono::NaiveDateTime`] - Option if T implements `ArrowField` - Vec if T implements `ArrowField` -- Large Arrow types [`LargeBinary`], [`LargeString`], [`LargeList`] are supported via the `type` attribute. Please see the [complex_example.rs](./arrow2_convert/tests/complex_example.rs) for usage. +- Large Arrow types [`LargeBinary`], [`LargeString`], [`LargeList`] are supported via the `type` attribute. Please see the [complex_example.rs](./arrow_convert/tests/complex_example.rs) for usage. - Fixed size types [`FixedSizeBinary`], [`FixedSizeList`] are supported via the `FixedSizeVec` type override. - Note: nesting of [`FixedSizeList`] is not supported. @@ -83,8 +83,8 @@ i128 represents a decimal number and requires the precision and scale to be spec For example to use `i128` as a field in a struct: ```rust -use arrow2_convert::field::I128; -use arrow2_convert::ArrowField; +use arrow_convert::field::I128; +use arrow_convert::ArrowField; #[derive(Debug, ArrowField)] struct S { @@ -96,16 +96,17 @@ struct S { A `vec` can be converted. 
to/from arrow by using the `arrow_serialize_to_mutable_array` and `arrow_array_deserialize_iterator_as_type` methods. ```rust -use arrow2::array::{Array, MutableArray}; -use arrow2_convert::serialize::arrow_serialize_to_mutable_array; -use arrow2_convert::deserialize::arrow_array_deserialize_iterator_as_type; -use arrow2_convert::field::I128; +use arrow::array::{Array, ArrayBuilder, ArrayRef}; +use arrow_convert::serialize::arrow_serialize_to_mutable_array; +use arrow_convert::deserialize::arrow_array_deserialize_iterator_as_type; +use arrow_convert::field::I128; use std::borrow::Borrow; +use std::sync::Arc; fn convert_i128() { let original_array = vec![1 as i128, 2, 3]; - let b: Box = arrow_serialize_to_mutable_array::<_, I128<32,32>, _>( - &original_array).unwrap().as_box(); + let b: ArrayRef = Arc::new(arrow_serialize_to_mutable_array::<_, I128<32,32>, _>( + &original_array).unwrap().finish()); let round_trip: Vec = arrow_array_deserialize_iterator_as_type::<_, I128<32,32>>( b.borrow()).unwrap().collect(); assert_eq!(original_array, round_trip); @@ -123,7 +124,7 @@ Since the Arrow format only supports one level of validity, nested option types This is not an exhaustive list. Please open an issue if you need a feature. ## Memory -Pass-thru conversions perform a single memory copy. Deserialization performs a copy from arrow2 to the destination. Serialization performs a copy from the source to arrow2. In-place deserialization is theoretically possible but currently not supported. +Pass-thru conversions perform a single memory copy. Deserialization performs a copy from arrow to the destination. Serialization performs a copy from the source to arrow. In-place deserialization is theoretically possible but currently not supported. ## Internals @@ -131,9 +132,9 @@ Pass-thru conversions perform a single memory copy. Deserialization performs a c The design is inspired by serde. 
The `ArrowSerialize` and `ArrowDeserialize` are analogs of serde's `Serialize` and `Deserialize` respectively. -However unlike serde's traits provide an exhaustive and flexible mapping to the serde data model, arrow2_convert's traits provide a much more narrower mapping to arrow2's data structures. +However, whereas serde's traits provide an exhaustive and flexible mapping to the serde data model, arrow_convert's traits provide a much narrower mapping to arrow's data structures. -Specifically, the `ArrowSerialize` trait provides the logic to serialize a type to the corresponding `arrow2::array::MutableArray`. The `ArrowDeserialize` trait deserializes a type from the corresponding `arrow2::array::ArrowArray`. +Specifically, the `ArrowSerialize` trait provides the logic to serialize a type to the corresponding `arrow::array::ArrayBuilder`. The `ArrowDeserialize` trait deserializes a type from the corresponding `arrow::array::ArrowArray`. ### Workarounds @@ -141,7 +142,7 @@ Features such as partial implementation specialization and generic associated ty For example custom types need to explicitly enable Vec serialization via the `arrow_enable_vec_for_type` macro on the primitive type. This is needed since Vec is a special type in Arrow, but without implementation specialization there's no way to special-case it. -Availability of generaic associated types would simplify the implementation for large and fixed types, since a generic MutableArray can be defined. Ideally for code reusability, we wouldn’t have to reimplement `ArrowSerialize` and `ArrowDeserialize` for large and fixed size types since the primitive types are the same. However, this requires the trait functions to take a generic bounded mutable array as an argument instead of a single array type. This requires the `ArrowSerialize` and `ArrowDeserialize` implementations to be able to specify the bounds as part of the associated type, which is not possible without generic associated types. 
+Availability of generic associated types would simplify the implementation for large and fixed types, since a generic ArrayBuilder can be defined. Ideally for code reusability, we wouldn’t have to reimplement `ArrowSerialize` and `ArrowDeserialize` for large and fixed size types since the primitive types are the same. However, this requires the trait functions to take a generic bounded mutable array as an argument instead of a single array type. This requires the `ArrowSerialize` and `ArrowDeserialize` implementations to be able to specify the bounds as part of the associated type, which is not possible without generic associated types. As a result, we’re forced to sacrifice code reusability and introduce a little bit of complexity by providing separate `ArrowSerialize` and `ArrowDeserialize` implementations for large and fixed size types via placeholder structures. This also requires introducing the `Type` associated type to `ArrowField` so that the arrow type can be overriden via a macro field attribute without affecting the actual type. 
diff --git a/arrow2_convert/Cargo.toml b/arrow2_convert/Cargo.toml deleted file mode 100644 index 6fc9ccc..0000000 --- a/arrow2_convert/Cargo.toml +++ /dev/null @@ -1,34 +0,0 @@ -[package] -name = "arrow2_convert" -version = "0.5.0" -authors = [ - "Jorge Leitao ", - "Chandra Penke ", -] -edition = "2021" -license = "Apache-2.0 OR MIT" -keywords = ["Arrow", "arrow2"] -repository = "https://github.com/DataEngineeringLabs/arrow2-convert" -description = "Convert between nested rust types and Arrow with arrow2" - -[dependencies] -arrow2 = "0.17" -arrow2_convert_derive = { version = "0.5.0", path = "../arrow2_convert_derive", optional = true } -chrono = { version = "0.4", default_features = false, features = ["std"] } -err-derive = "0.3" - -[dev-dependencies] -arrow2_convert_derive = { version = "0.5.0", path = "../arrow2_convert_derive" } -criterion = "0.4" -trybuild = "1.0" - -[features] -default = ["derive"] -derive = ["arrow2_convert_derive"] - -[lib] -bench = false - -[[bench]] -name = "bench" -harness = false diff --git a/arrow2_convert/src/deserialize.rs b/arrow2_convert/src/deserialize.rs deleted file mode 100644 index d8ea29d..0000000 --- a/arrow2_convert/src/deserialize.rs +++ /dev/null @@ -1,430 +0,0 @@ -//! Implementation and traits for deserializing from Arrow. - -use arrow2::{array::*, buffer::Buffer, types::NativeType}; -use chrono::{NaiveDate, NaiveDateTime}; - -use crate::field::*; - -/// Implemented by [`ArrowField`] that can be deserialized from arrow -pub trait ArrowDeserialize: ArrowField + Sized -where - Self::ArrayType: ArrowArray, - for<'a> &'a Self::ArrayType: IntoIterator, -{ - /// The `arrow2::Array` type corresponding to this field - type ArrayType; - - /// Deserialize this field from arrow - fn arrow_deserialize( - v: <&Self::ArrayType as IntoIterator>::Item, - ) -> Option<::Type>; - - #[inline] - #[doc(hidden)] - /// For internal use only - /// - /// This is an ugly hack to allow generating a blanket Option deserialize. 
- /// Ideally we would be able to capture the optional field of the iterator via - /// something like for<'a> &'a T::ArrayType: IntoIterator>, - /// However, the E parameter seems to confuse the borrow checker if it's a reference. - fn arrow_deserialize_internal( - v: <&Self::ArrayType as IntoIterator>::Item, - ) -> ::Type { - Self::arrow_deserialize(v).unwrap() - } -} - -/// Internal trait used to support deserialization and iteration of structs, and nested struct lists -/// -/// Trivial pass-thru implementations are provided for arrow2 arrays that implement IntoIterator. -/// -/// The derive macro generates implementations for typed struct arrays. -#[doc(hidden)] -pub trait ArrowArray -where - for<'a> &'a Self: IntoIterator, -{ - type BaseArrayType: Array; - - // Returns a typed iterator to the underlying elements of the array from an untyped Array reference. - fn iter_from_array_ref(b: &dyn Array) -> <&Self as IntoIterator>::IntoIter; -} - -// Macro to facilitate implementation for numeric types and numeric arrays. -macro_rules! impl_arrow_deserialize_primitive { - ($physical_type:ty) => { - impl ArrowDeserialize for $physical_type { - type ArrayType = PrimitiveArray<$physical_type>; - - #[inline] - fn arrow_deserialize<'a>(v: Option<&$physical_type>) -> Option { - v.map(|t| *t) - } - } - - impl_arrow_array!(PrimitiveArray<$physical_type>); - }; -} - -macro_rules! 
impl_arrow_array { - ($array:ty) => { - impl ArrowArray for $array { - type BaseArrayType = Self; - - #[inline] - fn iter_from_array_ref(b: &dyn Array) -> <&Self as IntoIterator>::IntoIter { - b.as_any() - .downcast_ref::() - .unwrap() - .into_iter() - } - } - }; -} - -// blanket implementation for optional fields -impl ArrowDeserialize for Option -where - T: ArrowDeserialize, - T::ArrayType: 'static + ArrowArray, - for<'a> &'a T::ArrayType: IntoIterator, -{ - type ArrayType = ::ArrayType; - - #[inline] - fn arrow_deserialize( - v: <&Self::ArrayType as IntoIterator>::Item, - ) -> Option<::Type> { - Self::arrow_deserialize_internal(v).map(Some) - } - - #[inline] - fn arrow_deserialize_internal( - v: <&Self::ArrayType as IntoIterator>::Item, - ) -> ::Type { - ::arrow_deserialize(v) - } -} - -impl_arrow_deserialize_primitive!(u8); -impl_arrow_deserialize_primitive!(u16); -impl_arrow_deserialize_primitive!(u32); -impl_arrow_deserialize_primitive!(u64); -impl_arrow_deserialize_primitive!(i8); -impl_arrow_deserialize_primitive!(i16); -impl_arrow_deserialize_primitive!(i32); -impl_arrow_deserialize_primitive!(i64); -impl_arrow_deserialize_primitive!(arrow2::types::f16); -impl_arrow_deserialize_primitive!(f32); -impl_arrow_deserialize_primitive!(f64); - -impl ArrowDeserialize for I128 { - type ArrayType = PrimitiveArray; - - #[inline] - fn arrow_deserialize<'a>(v: Option<&i128>) -> Option { - v.copied() - } -} - -impl_arrow_array!(PrimitiveArray); - -impl ArrowDeserialize for String { - type ArrayType = Utf8Array; - - #[inline] - fn arrow_deserialize(v: Option<&str>) -> Option { - v.map(|t| t.to_string()) - } -} - -impl ArrowDeserialize for LargeString { - type ArrayType = Utf8Array; - - #[inline] - fn arrow_deserialize(v: Option<&str>) -> Option { - v.map(|t| t.to_string()) - } -} - -impl ArrowDeserialize for bool { - type ArrayType = BooleanArray; - - #[inline] - fn arrow_deserialize(v: Option) -> Option { - v - } -} - -impl ArrowDeserialize for NaiveDateTime { - type 
ArrayType = PrimitiveArray; - - #[inline] - fn arrow_deserialize(v: Option<&i64>) -> Option { - v.map(|t| arrow2::temporal_conversions::timestamp_ns_to_datetime(*t)) - } -} - -impl ArrowDeserialize for NaiveDate { - type ArrayType = PrimitiveArray; - - #[inline] - fn arrow_deserialize(v: Option<&i32>) -> Option { - v.map(|t| arrow2::temporal_conversions::date32_to_date(*t)) - } -} - -/// Iterator for for [`BufferBinaryArray`] -pub struct BufferBinaryArrayIter<'a> { - index: usize, - array: &'a BinaryArray, -} - -impl<'a> Iterator for BufferBinaryArrayIter<'a> { - type Item = Option>; - - fn next(&mut self) -> Option { - if self.index >= self.array.len() { - None - } else { - if let Some(validity) = self.array.validity() { - if !validity.get_bit(self.index) { - self.index += 1; - return Some(None); - } - } - let (start, end) = self.array.offsets().start_end(self.index); - self.index += 1; - Some(Some(self.array.values().clone().sliced(start, end - start))) - } - } -} - -/// Internal `ArrowArray` helper to iterate over a `BinaryArray` while exposing Buffer slices -pub struct BufferBinaryArray; - -impl<'a> IntoIterator for &'a BufferBinaryArray { - type Item = Option>; - - type IntoIter = BufferBinaryArrayIter<'a>; - - fn into_iter(self) -> Self::IntoIter { - unimplemented!("Use iter_from_array_ref"); - } -} - -impl ArrowArray for BufferBinaryArray { - type BaseArrayType = BinaryArray; - #[inline] - fn iter_from_array_ref(a: &dyn Array) -> <&Self as IntoIterator>::IntoIter { - let b = a.as_any().downcast_ref::().unwrap(); - - BufferBinaryArrayIter { index: 0, array: b } - } -} - -impl ArrowDeserialize for Buffer { - type ArrayType = BufferBinaryArray; - - #[inline] - fn arrow_deserialize(v: Option>) -> Option { - v - } -} - -impl ArrowDeserialize for Vec { - type ArrayType = BinaryArray; - - #[inline] - fn arrow_deserialize(v: Option<&[u8]>) -> Option { - v.map(|t| t.to_vec()) - } -} - -impl ArrowDeserialize for LargeBinary { - type ArrayType = BinaryArray; - - 
#[inline] - fn arrow_deserialize(v: Option<&[u8]>) -> Option> { - v.map(|t| t.to_vec()) - } -} - -impl ArrowDeserialize for FixedSizeBinary { - type ArrayType = FixedSizeBinaryArray; - - #[inline] - fn arrow_deserialize(v: Option<&[u8]>) -> Option> { - v.map(|t| t.to_vec()) - } -} - -fn arrow_deserialize_vec_helper( - v: Option>, -) -> Option< as ArrowField>::Type> -where - T: ArrowDeserialize + ArrowEnableVecForType + 'static, - for<'a> &'a T::ArrayType: IntoIterator, -{ - use std::ops::Deref; - v.map(|t| { - arrow_array_deserialize_iterator_internal::<::Type, T>(t.deref()) - .collect::::Type>>() - }) -} - -// Blanket implementation for Buffer -impl ArrowDeserialize for Buffer -where - T: ArrowDeserialize + NativeType + ArrowEnableVecForType, - for<'b> &'b ::ArrayType: IntoIterator, -{ - type ArrayType = ListArray; - - #[inline] - fn arrow_deserialize( - v: <&Self::ArrayType as IntoIterator>::Item, - ) -> Option<::Type> { - v.map(|t| { - t.as_any() - .downcast_ref::>() - .unwrap() - .values() - .clone() - }) - } -} - -// Blanket implementation for Vec -impl ArrowDeserialize for Vec -where - T: ArrowDeserialize + ArrowEnableVecForType + 'static, - ::ArrayType: 'static, - for<'b> &'b ::ArrayType: IntoIterator, -{ - type ArrayType = ListArray; - - fn arrow_deserialize(v: Option>) -> Option<::Type> { - arrow_deserialize_vec_helper::(v) - } -} - -impl ArrowDeserialize for LargeVec -where - T: ArrowDeserialize + ArrowEnableVecForType + 'static, - ::ArrayType: 'static, - for<'b> &'b ::ArrayType: IntoIterator, -{ - type ArrayType = ListArray; - - fn arrow_deserialize(v: Option>) -> Option<::Type> { - arrow_deserialize_vec_helper::(v) - } -} - -impl ArrowDeserialize for FixedSizeVec -where - T: ArrowDeserialize + ArrowEnableVecForType + 'static, - ::ArrayType: 'static, - for<'b> &'b ::ArrayType: IntoIterator, -{ - type ArrayType = FixedSizeListArray; - - fn arrow_deserialize(v: Option>) -> Option<::Type> { - arrow_deserialize_vec_helper::(v) - } -} - 
-impl_arrow_array!(BooleanArray); -impl_arrow_array!(Utf8Array); -impl_arrow_array!(Utf8Array); -impl_arrow_array!(BinaryArray); -impl_arrow_array!(BinaryArray); -impl_arrow_array!(FixedSizeBinaryArray); -impl_arrow_array!(ListArray); -impl_arrow_array!(ListArray); -impl_arrow_array!(FixedSizeListArray); - -/// Top-level API to deserialize from Arrow -pub trait TryIntoCollection -where - Element: ArrowField, - Collection: FromIterator, -{ - /// Convert from a `arrow2::Array` to any collection that implements the `FromIterator` trait - fn try_into_collection(self) -> arrow2::error::Result; - - /// Same as `try_into_collection` except can coerce the conversion to a specific Arrow type. This is - /// useful when the same rust type maps to one or more Arrow types for example `LargeString`. - fn try_into_collection_as_type(self) -> arrow2::error::Result - where - ArrowType: ArrowDeserialize + ArrowField + 'static, - for<'b> &'b ::ArrayType: IntoIterator; -} - -/// Helper to return an iterator for elements from a [`arrow2::array::Array`]. 
-fn arrow_array_deserialize_iterator_internal<'a, Element, Field>( - b: &'a dyn arrow2::array::Array, -) -> impl Iterator + 'a -where - Field: ArrowDeserialize + ArrowField + 'static, - for<'b> &'b ::ArrayType: IntoIterator, -{ - <::ArrayType as ArrowArray>::iter_from_array_ref(b) - .map(::arrow_deserialize_internal) -} - -/// Returns a typed iterator to a target type from an `arrow2::Array` -pub fn arrow_array_deserialize_iterator_as_type<'a, Element, ArrowType>( - arr: &'a dyn arrow2::array::Array, -) -> arrow2::error::Result + 'a> -where - Element: 'static, - ArrowType: ArrowDeserialize + ArrowField + 'static, - for<'b> &'b ::ArrayType: IntoIterator, -{ - if &::data_type() != arr.data_type() { - Err(arrow2::error::Error::InvalidArgumentError( - "Data type mismatch".to_string(), - )) - } else { - Ok(arrow_array_deserialize_iterator_internal::< - Element, - ArrowType, - >(arr)) - } -} - -/// Return an iterator that deserializes an [`Array`] to an element of type T -pub fn arrow_array_deserialize_iterator<'a, T>( - arr: &'a dyn arrow2::array::Array, -) -> arrow2::error::Result + 'a> -where - T: ArrowDeserialize + ArrowField + 'static, - for<'b> &'b ::ArrayType: IntoIterator, -{ - arrow_array_deserialize_iterator_as_type::(arr) -} - -impl TryIntoCollection for ArrowArray -where - Element: ArrowDeserialize + ArrowField + 'static, - for<'b> &'b ::ArrayType: IntoIterator, - ArrowArray: std::borrow::Borrow, - Collection: FromIterator, -{ - fn try_into_collection(self) -> arrow2::error::Result { - Ok(arrow_array_deserialize_iterator::(self.borrow())?.collect()) - } - - fn try_into_collection_as_type(self) -> arrow2::error::Result - where - ArrowType: ArrowDeserialize + ArrowField + 'static, - for<'b> &'b ::ArrayType: IntoIterator, - { - Ok( - arrow_array_deserialize_iterator_as_type::(self.borrow())? 
- .collect(), - ) - } -} diff --git a/arrow2_convert/src/serialize.rs b/arrow2_convert/src/serialize.rs deleted file mode 100644 index 3460445..0000000 --- a/arrow2_convert/src/serialize.rs +++ /dev/null @@ -1,534 +0,0 @@ -//! Implementation and traits for serializing to Arrow. - -use arrow2::array::*; -use arrow2::chunk::Chunk; -use arrow2::types::NativeType; -use arrow2::{array::Array, buffer::Buffer}; -use chrono::{NaiveDate, NaiveDateTime}; -use std::sync::Arc; - -use crate::field::*; - -/// Trait that is implemented by all types that are serializable to Arrow. -/// -/// Implementations are provided for all built-in arrow types as well as Vec, and Option -/// if T implements ArrowSerialize. -/// -/// Note that Vec implementation needs to be enabled by the [`crate::arrow_enable_vec_for_type`] macro. -pub trait ArrowSerialize: ArrowField { - /// The [`arrow2::array::MutableArray`] that holds this value - type MutableArrayType: arrow2::array::MutableArray; - - /// Create a new mutable array - fn new_array() -> Self::MutableArrayType; - - /// Serialize this field to arrow - fn arrow_serialize( - v: &::Type, - array: &mut Self::MutableArrayType, - ) -> arrow2::error::Result<()>; -} - -// Macro to facilitate implementation of serializable traits for numeric types and numeric mutable arrays. -macro_rules! 
impl_numeric_type { - ($physical_type:ty) => { - impl ArrowSerialize for $physical_type { - type MutableArrayType = MutablePrimitiveArray<$physical_type>; - - #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::default() - } - - #[inline] - fn arrow_serialize( - v: &Self, - array: &mut Self::MutableArrayType, - ) -> arrow2::error::Result<()> { - array.try_push(Some(*v)) - } - } - }; -} - -// blanket implementation for optional fields -impl ArrowSerialize for Option -where - T: ArrowSerialize, -{ - type MutableArrayType = ::MutableArrayType; - - #[inline] - fn new_array() -> Self::MutableArrayType { - ::new_array() - } - - #[inline] - fn arrow_serialize( - v: &::Type, - array: &mut Self::MutableArrayType, - ) -> arrow2::error::Result<()> { - match v.as_ref() { - Some(t) => ::arrow_serialize(t, array), - None => { - array.push_null(); - Ok(()) - } - } - } -} - -impl_numeric_type!(u8); -impl_numeric_type!(u16); -impl_numeric_type!(u32); -impl_numeric_type!(u64); -impl_numeric_type!(i8); -impl_numeric_type!(i16); -impl_numeric_type!(i32); -impl_numeric_type!(i64); -impl_numeric_type!(arrow2::types::f16); -impl_numeric_type!(f32); -impl_numeric_type!(f64); - -impl ArrowSerialize for I128 { - type MutableArrayType = MutablePrimitiveArray; - - #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::default() - } - - #[inline] - fn arrow_serialize(v: &i128, array: &mut Self::MutableArrayType) -> arrow2::error::Result<()> { - array.try_push(Some(*v)) - } -} - -impl ArrowSerialize for String { - type MutableArrayType = MutableUtf8Array; - - #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::default() - } - - #[inline] - fn arrow_serialize(v: &Self, array: &mut Self::MutableArrayType) -> arrow2::error::Result<()> { - array.try_push(Some(v)) - } -} - -impl ArrowSerialize for LargeString { - type MutableArrayType = MutableUtf8Array; - - #[inline] - fn new_array() -> Self::MutableArrayType { - 
Self::MutableArrayType::default() - } - - #[inline] - fn arrow_serialize( - v: &String, - array: &mut Self::MutableArrayType, - ) -> arrow2::error::Result<()> { - array.try_push(Some(v)) - } -} - -impl ArrowSerialize for bool { - type MutableArrayType = MutableBooleanArray; - - #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::default() - } - - #[inline] - fn arrow_serialize(v: &Self, array: &mut Self::MutableArrayType) -> arrow2::error::Result<()> { - array.try_push(Some(*v)) - } -} - -impl ArrowSerialize for NaiveDateTime { - type MutableArrayType = MutablePrimitiveArray; - - #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::from(::data_type()) - } - - #[inline] - fn arrow_serialize(v: &Self, array: &mut Self::MutableArrayType) -> arrow2::error::Result<()> { - array.try_push(Some(v.timestamp_nanos())) - } -} - -impl ArrowSerialize for NaiveDate { - type MutableArrayType = MutablePrimitiveArray; - - #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::from(::data_type()) - } - - #[inline] - fn arrow_serialize(v: &Self, array: &mut Self::MutableArrayType) -> arrow2::error::Result<()> { - array.try_push(Some( - chrono::Datelike::num_days_from_ce(v) - - arrow2::temporal_conversions::EPOCH_DAYS_FROM_CE, - )) - } -} - -impl ArrowSerialize for Buffer { - type MutableArrayType = MutableBinaryArray; - - #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::default() - } - - #[inline] - fn arrow_serialize(v: &Self, array: &mut Self::MutableArrayType) -> arrow2::error::Result<()> { - array.try_push(Some(v.as_slice())) - } -} - -impl ArrowSerialize for Vec { - type MutableArrayType = MutableBinaryArray; - - #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::default() - } - - #[inline] - fn arrow_serialize(v: &Self, array: &mut Self::MutableArrayType) -> arrow2::error::Result<()> { - array.try_push(Some(v)) - } -} - -impl 
ArrowSerialize for LargeBinary { - type MutableArrayType = MutableBinaryArray; - - #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::default() - } - - #[inline] - fn arrow_serialize( - v: &Vec, - array: &mut Self::MutableArrayType, - ) -> arrow2::error::Result<()> { - array.try_push(Some(v)) - } -} - -impl ArrowSerialize for FixedSizeBinary { - type MutableArrayType = MutableFixedSizeBinaryArray; - - #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::new(SIZE) - } - - #[inline] - fn arrow_serialize( - v: &Vec, - array: &mut Self::MutableArrayType, - ) -> arrow2::error::Result<()> { - array.try_push(Some(v)) - } -} - -// Blanket implementation for Buffer -impl ArrowSerialize for Buffer -where - T: NativeType + ArrowSerialize + ArrowEnableVecForType, -{ - type MutableArrayType = MutableListArray>; - - #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::new_with_field( - MutablePrimitiveArray::new(), - "item", - ::is_nullable(), - ) - } - - #[inline] - fn arrow_serialize( - v: &::Type, - array: &mut Self::MutableArrayType, - ) -> arrow2::error::Result<()> { - let values = array.mut_values(); - values.reserve(v.len()); - values.extend_from_slice(v.as_slice()); - array.try_push_valid() - } -} - -// Blanket implementation for Vec -impl ArrowSerialize for Vec -where - T: ArrowSerialize + ArrowEnableVecForType + 'static, - ::MutableArrayType: Default, -{ - type MutableArrayType = MutableListArray::MutableArrayType>; - - #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::new_with_field( - ::new_array(), - "item", - ::is_nullable(), - ) - } - - fn arrow_serialize( - v: &::Type, - array: &mut Self::MutableArrayType, - ) -> arrow2::error::Result<()> { - let values = array.mut_values(); - for i in v.iter() { - ::arrow_serialize(i, values)?; - } - array.try_push_valid() - } -} - -impl ArrowSerialize for LargeVec -where - T: ArrowSerialize + 
ArrowEnableVecForType + 'static, - ::MutableArrayType: Default, -{ - type MutableArrayType = MutableListArray::MutableArrayType>; - - #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::new_with_field( - ::new_array(), - "item", - ::is_nullable(), - ) - } - - fn arrow_serialize( - v: &::Type, - array: &mut Self::MutableArrayType, - ) -> arrow2::error::Result<()> { - let values = array.mut_values(); - for i in v.iter() { - ::arrow_serialize(i, values)?; - } - array.try_push_valid() - } -} - -impl ArrowSerialize for FixedSizeVec -where - T: ArrowSerialize + ArrowEnableVecForType + 'static, - ::MutableArrayType: Default, -{ - type MutableArrayType = MutableFixedSizeListArray<::MutableArrayType>; - - #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::new_with_field( - ::new_array(), - "item", - ::is_nullable(), - SIZE, - ) - } - - fn arrow_serialize( - v: &::Type, - array: &mut Self::MutableArrayType, - ) -> arrow2::error::Result<()> { - let values = array.mut_values(); - for i in v.iter() { - ::arrow_serialize(i, values)?; - } - array.try_push_valid() - } -} - -// internal helper method to extend a mutable array -fn arrow_serialize_extend_internal< - 'a, - A: 'static, - T: ArrowSerialize + ArrowField + 'static, - I: IntoIterator, ->( - into_iter: I, - array: &mut ::MutableArrayType, -) -> arrow2::error::Result<()> { - let iter = into_iter.into_iter(); - array.reserve(iter.size_hint().0); - for i in iter { - ::arrow_serialize(i, array)?; - } - Ok(()) -} - -/// Serializes an iterator into an `arrow2::MutableArray` -pub fn arrow_serialize_to_mutable_array< - 'a, - A: 'static, - T: ArrowSerialize + ArrowField + 'static, - I: IntoIterator, ->( - into_iter: I, -) -> arrow2::error::Result<::MutableArrayType> { - let mut arr = ::new_array(); - arrow_serialize_extend_internal::(into_iter, &mut arr)?; - Ok(arr) -} - -/// API to flatten a Chunk consisting of an `arrow2::array::StructArray` into a `Chunk` consisting of 
`arrow2::array::Array`s contained by the `StructArray` -pub trait FlattenChunk { - /// Convert an `arrow2::chunk::Chunk` containing a `arrow2::array::StructArray` to an `arrow2::chunk::Chunk` consisting of the - /// `arrow::array::Array`s contained by the `StructArray` by consuming the - /// original `Chunk`. Returns an error if the `Chunk` cannot be flattened. - fn flatten(self) -> Result>, arrow2::error::Error>; -} - -impl FlattenChunk for Chunk -where - A: AsRef, -{ - fn flatten(self) -> Result>, arrow2::error::Error> { - let arrays = self.into_arrays(); - - // we only support flattening of a Chunk containing a single StructArray - if arrays.len() != 1 { - return Err(arrow2::error::Error::InvalidArgumentError( - "Chunk must contain a single Array".to_string(), - )); - } - - let array = &arrays[0]; - - let physical_type = array.as_ref().data_type().to_physical_type(); - if physical_type != arrow2::datatypes::PhysicalType::Struct { - return Err(arrow2::error::Error::InvalidArgumentError( - "Array in Chunk must be of type arrow2::datatypes::PhysicalType::Struct" - .to_string(), - )); - } - - let struct_array = array - .as_ref() - .as_any() - .downcast_ref::() - .unwrap(); - Ok(Chunk::new(struct_array.values().to_vec())) - } -} - -/// Top-level API to serialize to Arrow -pub trait TryIntoArrow<'a, ArrowArray, Element> -where - Self: IntoIterator, - Element: 'static, -{ - /// Convert from any iterable collection into an `arrow2::Array` - fn try_into_arrow(self) -> arrow2::error::Result; - - /// Convert from any iterable collection into an `arrow2::Array` by coercing the conversion to a specific Arrow type. - /// This is useful when the same rust type maps to one or more Arrow types for example `LargeString`. 
- fn try_into_arrow_as_type(self) -> arrow2::error::Result - where - ArrowType: ArrowSerialize + ArrowField + 'static; -} - -impl<'a, Element, Collection> TryIntoArrow<'a, Arc, Element> for Collection -where - Element: ArrowSerialize + ArrowField + 'static, - Collection: IntoIterator, -{ - fn try_into_arrow(self) -> arrow2::error::Result> { - Ok(arrow_serialize_to_mutable_array::(self)?.as_arc()) - } - - fn try_into_arrow_as_type(self) -> arrow2::error::Result> - where - Field: ArrowSerialize + ArrowField + 'static, - { - Ok(arrow_serialize_to_mutable_array::(self)?.as_arc()) - } -} - -impl<'a, Element, Collection> TryIntoArrow<'a, Box, Element> for Collection -where - Element: ArrowSerialize + ArrowField + 'static, - Collection: IntoIterator, -{ - fn try_into_arrow(self) -> arrow2::error::Result> { - Ok(arrow_serialize_to_mutable_array::(self)?.as_box()) - } - - fn try_into_arrow_as_type(self) -> arrow2::error::Result> - where - E: ArrowSerialize + ArrowField + 'static, - { - Ok(arrow_serialize_to_mutable_array::(self)?.as_box()) - } -} - -impl<'a, Element, Collection> TryIntoArrow<'a, Chunk>, Element> for Collection -where - Element: ArrowSerialize + ArrowField + 'static, - Collection: IntoIterator, -{ - fn try_into_arrow(self) -> arrow2::error::Result>> { - Ok(Chunk::new(vec![arrow_serialize_to_mutable_array::< - Element, - Element, - Collection, - >(self)? - .as_arc()])) - } - - fn try_into_arrow_as_type(self) -> arrow2::error::Result>> - where - Field: ArrowSerialize + ArrowField + 'static, - { - Ok(Chunk::new(vec![arrow_serialize_to_mutable_array::< - Element, - Field, - Collection, - >(self)? - .as_arc()])) - } -} - -impl<'a, Element, Collection> TryIntoArrow<'a, Chunk>, Element> for Collection -where - Element: ArrowSerialize + ArrowField + 'static, - Collection: IntoIterator, -{ - fn try_into_arrow(self) -> arrow2::error::Result>> { - Ok(Chunk::new(vec![arrow_serialize_to_mutable_array::< - Element, - Element, - Collection, - >(self)? 
- .as_box()])) - } - - fn try_into_arrow_as_type(self) -> arrow2::error::Result>> - where - E: ArrowSerialize + ArrowField + 'static, - { - Ok(Chunk::new(vec![arrow_serialize_to_mutable_array::< - Element, - E, - Collection, - >(self)? - .as_box()])) - } -} diff --git a/arrow2_convert/tests/test_flatten_chunk.rs b/arrow2_convert/tests/test_flatten_chunk.rs deleted file mode 100644 index a3a8d3f..0000000 --- a/arrow2_convert/tests/test_flatten_chunk.rs +++ /dev/null @@ -1,68 +0,0 @@ -use arrow2::array::*; -use arrow2::chunk::Chunk; -use arrow2_convert::{serialize::*, ArrowField, ArrowSerialize}; -use std::sync::Arc; - -#[test] -fn test_flatten_chunk() { - #[derive(Debug, Clone, ArrowField, ArrowSerialize)] - struct Struct { - a: i64, - b: i64, - } - - let target = Chunk::new(vec![ - Int64Array::from(&[Some(1), Some(2)]).boxed(), - Int64Array::from(&[Some(1), Some(2)]).boxed(), - ]); - - let array = vec![Struct { a: 1, b: 1 }, Struct { a: 2, b: 2 }]; - - let array: Box = array.try_into_arrow().unwrap(); - let chunk: Chunk> = Chunk::new(vec![array]); - - let flattened: Chunk> = chunk.flatten().unwrap(); - - assert_eq!(flattened, target); -} - -#[test] -fn test_flatten_chunk_empty_chunk_error() { - let chunk: Chunk> = Chunk::new(vec![]); - assert!(chunk.flatten().is_err()); -} - -#[test] -fn test_flatten_chunk_no_single_struct_array_error() { - #[derive(Debug, Clone, ArrowField, ArrowSerialize)] - struct Struct { - a: i64, - b: String, - } - - let array = vec![ - Struct { - a: 1, - b: "one".to_string(), - }, - Struct { - a: 2, - b: "two".to_string(), - }, - ]; - - let array: Arc = array.try_into_arrow().unwrap(); - - let arrays = vec![array.clone(), array.clone()]; - let chunk = Chunk::new(arrays); - - assert!(chunk.flatten().is_err()); -} - -#[test] -fn test_flatten_chunk_type_not_struct_error() { - let array: Arc = Int32Array::from(&[Some(1), None, Some(3)]).arced(); - let chunk = Chunk::new(vec![array]); - - assert!(chunk.flatten().is_err()); -} diff --git 
a/arrow2_convert/tests/test_serialize.rs b/arrow2_convert/tests/test_serialize.rs deleted file mode 100644 index 3fe7cad..0000000 --- a/arrow2_convert/tests/test_serialize.rs +++ /dev/null @@ -1,133 +0,0 @@ -use arrow2::array::Array; -use arrow2::buffer::Buffer; -use arrow2::chunk::Chunk; -use arrow2_convert::field::{ArrowField, FixedSizeBinary}; -use arrow2_convert::serialize::*; -use std::sync::Arc; - -#[test] -fn test_error_exceed_fixed_size_binary() { - let strs = [b"abc".to_vec()]; - let r: arrow2::error::Result> = - strs.try_into_arrow_as_type::>(); - assert!(r.is_err()) -} - -#[test] -fn test_chunk() { - let strs = [b"abc".to_vec()]; - let r: Chunk> = strs.try_into_arrow_as_type::>().unwrap(); - assert_eq!(r.len(), 1); - assert_eq!( - r.arrays()[0].data_type(), - & as ArrowField>::data_type() - ); - - let r: Chunk> = strs.try_into_arrow().unwrap(); - assert_eq!(r.len(), 1); - assert_eq!( - r.arrays()[0].data_type(), - & as ArrowField>::data_type() - ); - - let r: Chunk> = strs.try_into_arrow_as_type::>().unwrap(); - assert_eq!(r.len(), 1); - assert_eq!( - r.arrays()[0].data_type(), - & as ArrowField>::data_type() - ); - - let r: Chunk> = strs.try_into_arrow().unwrap(); - assert_eq!(r.len(), 1); - assert_eq!( - r.arrays()[0].data_type(), - & as ArrowField>::data_type() - ); -} - -#[test] -fn test_array() { - let strs = [b"abc".to_vec()]; - let r: Box = strs.try_into_arrow_as_type::>().unwrap(); - assert_eq!( - r.data_type(), - & as ArrowField>::data_type() - ); - - let r: Box = strs.try_into_arrow().unwrap(); - assert_eq!(r.len(), 1); - assert_eq!(r.data_type(), & as ArrowField>::data_type()); - - let r: Arc = strs.try_into_arrow_as_type::>().unwrap(); - assert_eq!(r.len(), 1); - assert_eq!( - r.data_type(), - & as ArrowField>::data_type() - ); - - let r: Arc = strs.try_into_arrow().unwrap(); - assert_eq!(r.len(), 1); - assert_eq!(r.data_type(), & as ArrowField>::data_type()); -} - -#[test] -fn test_buffer() { - // Buffer and Vec should serialize into 
BinaryArray - let dat: Vec> = vec![(0..10).collect()]; - let r: Box = dat.try_into_arrow().unwrap(); - assert_eq!(r.len(), 1); - assert_eq!(r.data_type(), & as ArrowField>::data_type()); - assert_eq!(r.data_type(), & as ArrowField>::data_type()); - - // Buffer and Vec should serialize into ListArray - let dat: Vec> = vec![(0..10).collect()]; - let r: Box = dat.try_into_arrow().unwrap(); - assert_eq!(r.len(), 1); - assert_eq!(r.data_type(), & as ArrowField>::data_type()); - assert_eq!(r.data_type(), & as ArrowField>::data_type()); -} - -#[test] -fn test_field_serialize_error() { - pub struct CustomType(u64); - - impl arrow2_convert::field::ArrowField for CustomType { - type Type = Self; - - #[inline] - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::Extension( - "custom".to_string(), - Box::new(arrow2::datatypes::DataType::UInt64), - None, - ) - } - } - - impl arrow2_convert::serialize::ArrowSerialize for CustomType { - type MutableArrayType = arrow2::array::MutablePrimitiveArray; - - #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::from(::data_type()) - } - - #[inline] - fn arrow_serialize(_: &Self, _: &mut Self::MutableArrayType) -> arrow2::error::Result<()> { - Err(arrow2::error::Error::NotYetImplemented("".to_owned())) - } - } - - impl arrow2_convert::deserialize::ArrowDeserialize for CustomType { - type ArrayType = arrow2::array::PrimitiveArray; - - #[inline] - fn arrow_deserialize(v: Option<&u64>) -> Option { - v.map(|t| CustomType(*t)) - } - } - - let arr = vec![CustomType(0)]; - let r: arrow2::error::Result> = arr.try_into_arrow(); - assert!(r.is_err()) -} diff --git a/arrow_convert/Cargo.toml b/arrow_convert/Cargo.toml new file mode 100644 index 0000000..65cac97 --- /dev/null +++ b/arrow_convert/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "arrow_convert" +version = "0.6.0" +authors = [ + "Swoorup Joshi ", + "Jorge Leitao ", + "Chandra Penke ", +] +edition = "2021" +license = "Apache-2.0 OR 
MIT" +keywords = ["Arrow", "arrow"] +repository = "https://github.com/Swoorup/arrow-convert" +description = "Convert between nested rust types and Arrow with arrow" + +[dependencies] +arrow = "51.0" +arrow_convert_derive = { version = "0.6.0", path = "../arrow_convert_derive", optional = true } +half = { version = "2.1", default-features = false } +chrono = { version = "0.4", default_features = false, features = ["std"] } +err-derive = "0.3" + +[dev-dependencies] +arrow_convert_derive = { version = "0.6.0", path = "../arrow_convert_derive" } +criterion = "0.5" +trybuild = "1.0" + +[features] +default = ["derive"] +derive = ["arrow_convert_derive"] + +[lib] +bench = false + +[[bench]] +name = "bench" +harness = false diff --git a/arrow2_convert/LICENSE-APACHE b/arrow_convert/LICENSE-APACHE similarity index 100% rename from arrow2_convert/LICENSE-APACHE rename to arrow_convert/LICENSE-APACHE diff --git a/arrow2_convert/LICENSE-MIT b/arrow_convert/LICENSE-MIT similarity index 100% rename from arrow2_convert/LICENSE-MIT rename to arrow_convert/LICENSE-MIT diff --git a/arrow2_convert/README.md b/arrow_convert/README.md similarity index 100% rename from arrow2_convert/README.md rename to arrow_convert/README.md diff --git a/arrow2_convert/benches/bench.rs b/arrow_convert/benches/bench.rs similarity index 71% rename from arrow2_convert/benches/bench.rs rename to arrow_convert/benches/bench.rs index 09951e9..15879b5 100644 --- a/arrow2_convert/benches/bench.rs +++ b/arrow_convert/benches/bench.rs @@ -1,26 +1,29 @@ -use arrow2::{array::Array, buffer::Buffer}; -use arrow2_convert::{ +use arrow::{ + array::ArrayRef, + buffer::{Buffer, ScalarBuffer}, +}; +use arrow_convert::{ deserialize::TryIntoCollection, serialize::TryIntoArrow, ArrowDeserialize, ArrowField, ArrowSerialize, }; use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; -// Arrow stores U8 arrays as `arrow2::array::BinaryArray` +// Arrow stores U8 arrays as 
`arrow::array::BinaryArray` #[derive(ArrowField, ArrowSerialize, ArrowDeserialize)] #[arrow_field(transparent)] -pub struct BufU8Struct(Buffer); +pub struct BufU8Struct(Buffer); -// Arrow stores other arrows as `arrow2::array::ListArray` +// Arrow stores other arrows as `arrow::array::ListArray` #[derive(ArrowField, ArrowSerialize, ArrowDeserialize)] #[arrow_field(transparent)] -pub struct BufU32Struct(Buffer); +pub struct BufU32Struct(ScalarBuffer); -// Arrow stores U8 arrows as `arrow2::array::BinaryArray` +// Arrow stores U8 arrows as `arrow::array::BinaryArray` #[derive(ArrowField, ArrowSerialize, ArrowDeserialize)] #[arrow_field(transparent)] pub struct VecU8Struct(Vec); -// Arrow stores other arrows as `arrow2::array::ListArray` +// Arrow stores other arrows as `arrow::array::ListArray` #[derive(ArrowField, ArrowSerialize, ArrowDeserialize)] #[arrow_field(transparent)] pub struct VecU32Struct(Vec); @@ -30,27 +33,27 @@ pub fn bench_buffer_serialize(c: &mut Criterion) { for size in [1, 10, 100, 1000, 10000].iter() { group.throughput(Throughput::Elements(*size as u64)); group.bench_with_input(BenchmarkId::new("BufferU8", size), size, |b, &size| { - let data = [BufU8Struct((0..size as u8).into_iter().collect())]; + let data = [BufU8Struct((0..size as u8).collect())]; b.iter(|| { - let _: Box = TryIntoArrow::try_into_arrow(black_box(&data)).unwrap(); + let _: ArrayRef = TryIntoArrow::try_into_arrow(black_box(&data)).unwrap(); }); }); group.bench_with_input(BenchmarkId::new("VecU8", size), size, |b, &size| { - let data = [VecU8Struct((0..size as u8).into_iter().collect())]; + let data = [VecU8Struct((0..size as u8).collect())]; b.iter(|| { - let _: Box = TryIntoArrow::try_into_arrow(black_box(&data)).unwrap(); + let _: ArrayRef = TryIntoArrow::try_into_arrow(black_box(&data)).unwrap(); }); }); group.bench_with_input(BenchmarkId::new("BufferU32", size), size, |b, &size| { - let data = [BufU32Struct((0..size as u32).into_iter().collect())]; + let data = 
[BufU32Struct((0..size as u32).collect())]; b.iter(|| { - let _: Box = TryIntoArrow::try_into_arrow(black_box(&data)).unwrap(); + let _: ArrayRef = TryIntoArrow::try_into_arrow(black_box(&data)).unwrap(); }); }); group.bench_with_input(BenchmarkId::new("VecU32", size), size, |b, &size| { - let data = [VecU32Struct((0..size as u32).into_iter().collect())]; + let data = [VecU32Struct((0..size as u32).collect())]; b.iter(|| { - let _: Box = TryIntoArrow::try_into_arrow(black_box(&data)).unwrap(); + let _: ArrayRef = TryIntoArrow::try_into_arrow(black_box(&data)).unwrap(); }); }); } @@ -60,7 +63,7 @@ pub fn bench_buffer_deserialize(c: &mut Criterion) { for size in [1, 10, 100, 1000, 10000].iter() { group.throughput(Throughput::Elements(*size as u64)); group.bench_with_input(BenchmarkId::new("BufferU8", size), size, |b, &size| { - let data: Box = [BufU8Struct((0..size as u8).into_iter().collect())] + let data: ArrayRef = [BufU8Struct((0..size as u8).collect())] .try_into_arrow() .unwrap(); b.iter_batched( @@ -73,7 +76,7 @@ pub fn bench_buffer_deserialize(c: &mut Criterion) { ) }); group.bench_with_input(BenchmarkId::new("VecU8", size), size, |b, &size| { - let data: Box = [VecU8Struct((0..size as u8).into_iter().collect())] + let data: ArrayRef = [VecU8Struct((0..size as u8).collect())] .try_into_arrow() .unwrap(); b.iter_batched( @@ -86,7 +89,7 @@ pub fn bench_buffer_deserialize(c: &mut Criterion) { ); }); group.bench_with_input(BenchmarkId::new("BufferU32", size), size, |b, &size| { - let data: Box = [BufU32Struct((0..size as u32).into_iter().collect())] + let data: ArrayRef = [BufU32Struct((0..size as u32).collect())] .try_into_arrow() .unwrap(); b.iter_batched( @@ -99,7 +102,7 @@ pub fn bench_buffer_deserialize(c: &mut Criterion) { ) }); group.bench_with_input(BenchmarkId::new("VecU32", size), size, |b, &size| { - let data: Box = [VecU32Struct((0..size as u32).into_iter().collect())] + let data: ArrayRef = [VecU32Struct((0..size as u32).collect())] .try_into_arrow() 
.unwrap(); b.iter_batched( diff --git a/arrow_convert/src/deserialize/mod.rs b/arrow_convert/src/deserialize/mod.rs new file mode 100644 index 0000000..3229f43 --- /dev/null +++ b/arrow_convert/src/deserialize/mod.rs @@ -0,0 +1,530 @@ +//! Implementation and traits for deserializing from Arrow. + +use std::sync::Arc; + +use arrow::{ + array::*, + buffer::{Buffer, ScalarBuffer}, + datatypes::{self, ArrowNativeType, ArrowPrimitiveType, Decimal128Type}, +}; +use chrono::{NaiveDate, NaiveDateTime}; + +use crate::field::*; + +/// Implement by Arrow arrays that can be converted to an iterator +pub trait IntoArrowArrayIterator { + /// The type of the iterator item + type Item; + /// The type of the iterator + type IntoIter: Iterator; + /// Convert the array to an iterator + fn into_iter(self) -> Self::IntoIter; +} + +impl<'a, T: ArrowPrimitiveType> IntoArrowArrayIterator for &'a PrimitiveArray { + type Item = Option; + + type IntoIter = PrimitiveIter<'a, T>; + + fn into_iter(self) -> Self::IntoIter { + ::into_iter(self) + } +} + +/// Implemented by [`ArrowField`] that can be deserialized from arrow +pub trait ArrowDeserialize: ArrowField + Sized +where + Self::ArrayType: ArrowArray, + for<'a> &'a Self::ArrayType: IntoArrowArrayIterator, +{ + /// The `arrow::Array` type corresponding to this field + type ArrayType; + + /// Deserialize this field from arrow + fn arrow_deserialize( + v: <&Self::ArrayType as IntoArrowArrayIterator>::Item, + ) -> Option<::Type>; + + #[inline] + #[doc(hidden)] + /// For internal use only + /// + /// This is an ugly hack to allow generating a blanket Option deserialize. + /// Ideally we would be able to capture the optional field of the iterator via + /// something like for<'a> &'a T::ArrayType: IntoArrowArrayIterator>, + /// However, the E parameter seems to confuse the borrow checker if it's a reference. 
+ fn arrow_deserialize_internal( + v: <&Self::ArrayType as IntoArrowArrayIterator>::Item, + ) -> ::Type { + Self::arrow_deserialize(v).unwrap() + } +} + +/// Internal trait used to support deserialization and iteration of structs, and nested struct lists +/// +/// Trivial pass-thru implementations are provided for arrow arrays that implement IntoArrowArrayIterator. +/// +/// The derive macro generates implementations for typed struct arrays. +#[doc(hidden)] +pub trait ArrowArray +where + for<'a> &'a Self: IntoArrowArrayIterator, +{ + type BaseArrayType: Array; + + // Returns a typed iterator to the underlying elements of the array from an untyped Array reference. + fn iter_from_array_ref(b: &dyn Array) -> <&Self as IntoArrowArrayIterator>::IntoIter; +} + +// Macro to facilitate implementation for numeric types and numeric arrays. +macro_rules! impl_arrow_deserialize_primitive { + ($physical_type:ty, $primitive_type:ty) => { + impl ArrowDeserialize for $physical_type { + type ArrayType = PrimitiveArray<$primitive_type>; + + #[inline] + fn arrow_deserialize<'a>( + v: Option<<$primitive_type as ArrowPrimitiveType>::Native>, + ) -> Option { + v + } + } + + impl_arrow_array!(PrimitiveArray<$primitive_type>); + }; +} + +macro_rules! 
impl_arrow_array { + ($array:ty) => { + impl ArrowArray for $array { + type BaseArrayType = Self; + + #[inline] + fn iter_from_array_ref(b: &dyn Array) -> <&Self as IntoArrowArrayIterator>::IntoIter { + let b = b.as_any().downcast_ref::().unwrap(); + <&Self as IntoArrowArrayIterator>::into_iter(b) + } + } + }; +} + +// blanket implementation for optional fields +impl ArrowDeserialize for Option +where + T: ArrowDeserialize, + T::ArrayType: 'static + ArrowArray, + for<'a> &'a T::ArrayType: IntoArrowArrayIterator, +{ + type ArrayType = ::ArrayType; + + #[inline] + fn arrow_deserialize( + v: <&Self::ArrayType as IntoArrowArrayIterator>::Item, + ) -> Option<::Type> { + Self::arrow_deserialize_internal(v).map(Some) + } + + #[inline] + fn arrow_deserialize_internal( + v: <&Self::ArrayType as IntoArrowArrayIterator>::Item, + ) -> ::Type { + ::arrow_deserialize(v) + } +} + +impl_arrow_deserialize_primitive!(u8, datatypes::UInt8Type); +impl_arrow_deserialize_primitive!(u16, datatypes::UInt16Type); +impl_arrow_deserialize_primitive!(u32, datatypes::UInt32Type); +impl_arrow_deserialize_primitive!(u64, datatypes::UInt64Type); +impl_arrow_deserialize_primitive!(i8, datatypes::Int8Type); +impl_arrow_deserialize_primitive!(i16, datatypes::Int16Type); +impl_arrow_deserialize_primitive!(i32, datatypes::Int32Type); +impl_arrow_deserialize_primitive!(i64, datatypes::Int64Type); +impl_arrow_deserialize_primitive!(half::f16, datatypes::Float16Type); +impl_arrow_deserialize_primitive!(f32, datatypes::Float32Type); +impl_arrow_deserialize_primitive!(f64, datatypes::Float64Type); + +impl ArrowDeserialize for I128 { + type ArrayType = PrimitiveArray; + + #[inline] + fn arrow_deserialize<'a>(v: Option) -> Option { + v + } +} + +impl_arrow_array!(PrimitiveArray); + +impl<'a, OffsetSize: OffsetSizeTrait> IntoArrowArrayIterator + for &'a GenericStringArray +{ + type Item = Option<&'a str>; + + type IntoIter = ArrayIter<&'a GenericStringArray>; + + fn into_iter(self) -> Self::IntoIter { + 
self.iter() + } +} +impl ArrowDeserialize for String { + type ArrayType = StringArray; + + #[inline] + fn arrow_deserialize(v: Option<&str>) -> Option { + v.map(|t| t.to_string()) + } +} + +impl ArrowDeserialize for LargeString { + type ArrayType = LargeStringArray; + + #[inline] + fn arrow_deserialize(v: Option<&str>) -> Option { + v.map(|t| t.to_string()) + } +} + +impl<'a> IntoArrowArrayIterator for &'a BooleanArray { + type Item = Option; + + type IntoIter = BooleanIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +impl ArrowDeserialize for bool { + type ArrayType = BooleanArray; + + #[inline] + fn arrow_deserialize(v: Option) -> Option { + v + } +} + +impl ArrowDeserialize for NaiveDateTime { + type ArrayType = TimestampNanosecondArray; + + #[inline] + fn arrow_deserialize(v: Option) -> Option { + v.and_then(arrow::temporal_conversions::timestamp_ns_to_datetime) + } +} + +impl ArrowDeserialize for NaiveDate { + type ArrayType = Date32Array; + + #[inline] + fn arrow_deserialize(v: Option) -> Option { + v.and_then(|t| arrow::temporal_conversions::as_date::(t as i64)) + } +} + +/// Iterator for for [`BufferBinaryArray`] +pub struct BufferBinaryArrayIter<'a> { + index: usize, + array: &'a BinaryArray, +} + +impl<'a> Iterator for BufferBinaryArrayIter<'a> { + type Item = Option<&'a [u8]>; + + fn next(&mut self) -> Option { + if self.index >= self.array.len() { + None + } else if self.array.is_valid(self.index) { + // self.array.iter + let value = self.array.value(self.index); + self.index += 1; + Some(Some(value)) + } else { + self.index += 1; + Some(None) + } + } +} + +/// Internal `ArrowArray` helper to iterate over a `BinaryArray` while exposing Buffer slices +pub struct BufferBinaryArray; + +impl<'a> IntoArrowArrayIterator for &'a BufferBinaryArray { + type Item = Option<&'a [u8]>; + + type IntoIter = BufferBinaryArrayIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + unimplemented!("Use iter_from_array_ref"); + } +} + +impl 
ArrowArray for BufferBinaryArray { + type BaseArrayType = BinaryArray; + #[inline] + fn iter_from_array_ref(a: &dyn Array) -> <&Self as IntoArrowArrayIterator>::IntoIter { + let b = a.as_any().downcast_ref::().unwrap(); + + BufferBinaryArrayIter { index: 0, array: b } + } +} + +// Treat both Buffer and ScalarBuffer the same +impl ArrowDeserialize for Buffer { + type ArrayType = BufferBinaryArray; + + #[inline] + fn arrow_deserialize(v: Option<&[u8]>) -> Option { + v.map(|t| t.into()) + } +} +impl ArrowDeserialize for ScalarBuffer { + type ArrayType = BufferBinaryArray; + + #[inline] + fn arrow_deserialize(v: Option<&[u8]>) -> Option { + v.map(|t| ScalarBuffer::from(t.to_vec())) + } +} + +impl ArrowDeserialize for Vec { + type ArrayType = BinaryArray; + + #[inline] + fn arrow_deserialize(v: Option<&[u8]>) -> Option { + v.map(|t| t.to_vec()) + } +} + +impl<'a, OffsetSize: OffsetSizeTrait> IntoArrowArrayIterator + for &'a GenericBinaryArray +{ + type Item = Option<&'a [u8]>; + + type IntoIter = ArrayIter<&'a GenericBinaryArray>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +impl<'a> IntoArrowArrayIterator for &'a FixedSizeBinaryArray { + type Item = Option<&'a [u8]>; + + type IntoIter = FixedSizeBinaryIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +impl ArrowDeserialize for LargeBinary { + type ArrayType = LargeBinaryArray; + + #[inline] + fn arrow_deserialize(v: Option<&[u8]>) -> Option> { + v.map(|t| t.to_vec()) + } +} + +impl ArrowDeserialize for FixedSizeBinary { + type ArrayType = FixedSizeBinaryArray; + + #[inline] + fn arrow_deserialize(v: Option<&[u8]>) -> Option> { + v.map(|t| t.to_vec()) + } +} + +fn arrow_deserialize_vec_helper( + v: Option>, +) -> Option< as ArrowField>::Type> +where + T: ArrowDeserialize + ArrowEnableVecForType + 'static, + for<'a> &'a T::ArrayType: IntoArrowArrayIterator, +{ + use std::ops::Deref; + v.map(|t| { + arrow_array_deserialize_iterator_internal::<::Type, T>(t.deref()) + 
.collect::::Type>>() + }) +} + +// Blanket implementation for ScalarBuffer +impl ArrowDeserialize for ScalarBuffer +where + K: ArrowPrimitiveType, + T: ArrowDeserialize> + ArrowNativeType + ArrowEnableVecForType, + for<'b> &'b ::ArrayType: IntoArrowArrayIterator, +{ + type ArrayType = ListArray; + + #[inline] + fn arrow_deserialize( + v: <&Self::ArrayType as IntoArrowArrayIterator>::Item, + ) -> Option<::Type> { + let t = v?; + let array = t + .as_any() + .downcast_ref::>() + .unwrap() + .values() + .clone(); + Some(array) + } +} + +// Blanket implementation for Vec +impl ArrowDeserialize for Vec +where + T: ArrowDeserialize + ArrowEnableVecForType + 'static, + ::ArrayType: 'static, + for<'b> &'b ::ArrayType: IntoArrowArrayIterator, +{ + type ArrayType = ListArray; + + fn arrow_deserialize(v: Option>) -> Option<::Type> { + arrow_deserialize_vec_helper::(v) + } +} + +impl<'a, OffsetSize: OffsetSizeTrait> IntoArrowArrayIterator for &'a GenericListArray { + type Item = Option>; + + type IntoIter = GenericListArrayIter<'a, OffsetSize>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +impl<'a> IntoArrowArrayIterator for &'a FixedSizeListArray { + type Item = Option>; + + type IntoIter = FixedSizeListIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +impl ArrowDeserialize for LargeVec +where + T: ArrowDeserialize + ArrowEnableVecForType + 'static, + ::ArrayType: 'static, + for<'b> &'b ::ArrayType: IntoArrowArrayIterator, +{ + type ArrayType = LargeListArray; + + fn arrow_deserialize(v: Option>) -> Option<::Type> { + arrow_deserialize_vec_helper::(v) + } +} + +impl ArrowDeserialize for FixedSizeVec +where + T: ArrowDeserialize + ArrowEnableVecForType + 'static, + ::ArrayType: 'static, + for<'b> &'b ::ArrayType: IntoArrowArrayIterator, +{ + type ArrayType = FixedSizeListArray; + + fn arrow_deserialize(v: Option>) -> Option<::Type> { + arrow_deserialize_vec_helper::(v) + } +} + +impl_arrow_array!(BooleanArray); 
+impl_arrow_array!(StringArray); +impl_arrow_array!(LargeStringArray); +impl_arrow_array!(BinaryArray); +impl_arrow_array!(LargeBinaryArray); +impl_arrow_array!(FixedSizeBinaryArray); +impl_arrow_array!(ListArray); +impl_arrow_array!(LargeListArray); +impl_arrow_array!(FixedSizeListArray); +impl_arrow_array!(Date32Array); +impl_arrow_array!(TimestampNanosecondArray); + +/// Top-level API to deserialize from Arrow +pub trait TryIntoCollection +where + Element: ArrowField, + Collection: FromIterator, +{ + /// Convert from a `arrow::Array` to any collection that implements the `FromIterator` trait + fn try_into_collection(self) -> arrow::error::Result; + + /// Same as `try_into_collection` except can coerce the conversion to a specific Arrow type. This is + /// useful when the same rust type maps to one or more Arrow types for example `LargeString`. + fn try_into_collection_as_type(self) -> arrow::error::Result + where + ArrowType: ArrowDeserialize + ArrowField + 'static, + for<'b> &'b ::ArrayType: IntoArrowArrayIterator; +} + +/// Helper to return an iterator for elements from a [`arrow::array::Array`]. 
+fn arrow_array_deserialize_iterator_internal<'a, Element, Field>( + b: &'a dyn arrow::array::Array, +) -> impl Iterator + 'a +where + Field: ArrowDeserialize + ArrowField + 'static, + for<'b> &'b ::ArrayType: IntoArrowArrayIterator, +{ + <::ArrayType as ArrowArray>::iter_from_array_ref(b) + .map(::arrow_deserialize_internal) +} + +/// Returns a typed iterator to a target type from an `arrow::Array` +pub fn arrow_array_deserialize_iterator_as_type<'a, Element, ArrowType>( + arr: &'a dyn arrow::array::Array, +) -> arrow::error::Result + 'a> +where + Element: 'static, + ArrowType: ArrowDeserialize + ArrowField + 'static, + for<'b> &'b ::ArrayType: IntoArrowArrayIterator, +{ + if &::data_type() != arr.data_type() { + Err(arrow::error::ArrowError::InvalidArgumentError( + "Data type mismatch".to_string(), + )) + } else { + Ok(arrow_array_deserialize_iterator_internal::< + Element, + ArrowType, + >(arr)) + } +} + +/// Return an iterator that deserializes an [`Array`] to an element of type T +pub fn arrow_array_deserialize_iterator<'a, T>( + arr: &'a dyn arrow::array::Array, +) -> arrow::error::Result + 'a> +where + T: ArrowDeserialize + ArrowField + 'static, + for<'b> &'b ::ArrayType: IntoArrowArrayIterator, +{ + arrow_array_deserialize_iterator_as_type::(arr) +} + +impl TryIntoCollection for ArrowArray +where + Element: ArrowDeserialize + ArrowField + 'static, + for<'b> &'b ::ArrayType: IntoArrowArrayIterator, + ArrowArray: std::borrow::Borrow, + Collection: FromIterator, +{ + fn try_into_collection(self) -> arrow::error::Result { + Ok(arrow_array_deserialize_iterator::(self.borrow())?.collect()) + } + + fn try_into_collection_as_type(self) -> arrow::error::Result + where + ArrowType: ArrowDeserialize + ArrowField + 'static, + for<'b> &'b ::ArrayType: IntoArrowArrayIterator, + { + Ok( + arrow_array_deserialize_iterator_as_type::(self.borrow())? 
+ .collect(), + ) + } +} diff --git a/arrow2_convert/src/field.rs b/arrow_convert/src/field.rs similarity index 60% rename from arrow2_convert/src/field.rs rename to arrow_convert/src/field.rs index c182233..7f6815e 100644 --- a/arrow2_convert/src/field.rs +++ b/arrow_convert/src/field.rs @@ -1,15 +1,16 @@ //! Implementation and traits for mapping rust types to Arrow types -use arrow2::{ - buffer::Buffer, - datatypes::{DataType, Field}, - types::NativeType, +use std::sync::Arc; + +use arrow::{ + buffer::{Buffer, ScalarBuffer}, + datatypes::{ArrowNativeType, DataType, Field}, }; use chrono::{NaiveDate, NaiveDateTime}; /// Trait implemented by all types that can be used as an Arrow field. /// -/// Implementations are provided for types already supported by the arrow2 crate: +/// Implementations are provided for types already supported by the arrow crate: /// - numeric types: [`u8`], [`u16`], [`u32`], [`u64`], [`i8`], [`i16`], [`i32`], [`i128`], [`i64`], [`f32`], [`f64`], /// - other types: [`bool`], [`String`] /// - temporal types: [`chrono::NaiveDate`], [`chrono::NaiveDateTime`] @@ -26,13 +27,17 @@ pub trait ArrowField { /// type is LargeString, this should be String. type Type; + /// the Native Arrow data type backing this field. This should be `Self::Type` in most cases, + /// with the exception of `NaiveDate` + type Native; + /// The [`DataType`] fn data_type() -> DataType; #[inline] #[doc(hidden)] /// For internal use and not meant to be reimplemented. - /// returns the [`arrow2::datatypes::Field`] for this field + /// returns the [`arrow::datatypes::Field`] for this field fn field(name: &str) -> Field { Field::new(name, Self::data_type(), Self::is_nullable()) } @@ -67,10 +72,11 @@ macro_rules! 
impl_numeric_type { ($physical_type:ty, $logical_type:ident) => { impl ArrowField for $physical_type { type Type = $physical_type; + type Native = $physical_type; #[inline] - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::$logical_type + fn data_type() -> arrow::datatypes::DataType { + arrow::datatypes::DataType::$logical_type } } }; @@ -89,9 +95,10 @@ where T: ArrowField, { type Type = Option<::Type>; + type Native = Option<::Native>; #[inline] - fn data_type() -> arrow2::datatypes::DataType { + fn data_type() -> arrow::datatypes::DataType { ::data_type() } @@ -110,28 +117,30 @@ impl_numeric_type_full!(i8, Int8); impl_numeric_type_full!(i16, Int16); impl_numeric_type_full!(i32, Int32); impl_numeric_type_full!(i64, Int64); -impl_numeric_type_full!(arrow2::types::f16, Float16); +impl_numeric_type_full!(half::f16, Float16); impl_numeric_type_full!(f32, Float32); impl_numeric_type_full!(f64, Float64); /// Maps a rust i128 to an Arrow Decimal where precision and scale are required. 
-pub struct I128 {} +pub struct I128 {} -impl ArrowField for I128 { +impl ArrowField for I128 { type Type = i128; + type Native = i128; #[inline] - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::Decimal(PRECISION, SCALE) + fn data_type() -> arrow::datatypes::DataType { + arrow::datatypes::DataType::Decimal128(PRECISION, SCALE) } } impl ArrowField for String { type Type = String; + type Native = String; #[inline] - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::Utf8 + fn data_type() -> arrow::datatypes::DataType { + arrow::datatypes::DataType::Utf8 } } @@ -140,55 +149,71 @@ pub struct LargeString {} impl ArrowField for LargeString { type Type = String; + type Native = String; #[inline] - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::LargeUtf8 + fn data_type() -> arrow::datatypes::DataType { + arrow::datatypes::DataType::LargeUtf8 } } impl ArrowField for bool { type Type = Self; + type Native = Self; #[inline] - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::Boolean + fn data_type() -> arrow::datatypes::DataType { + arrow::datatypes::DataType::Boolean } } impl ArrowField for NaiveDateTime { type Type = Self; + type Native = i64; #[inline] - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::Timestamp(arrow2::datatypes::TimeUnit::Nanosecond, None) + fn data_type() -> arrow::datatypes::DataType { + arrow::datatypes::DataType::Timestamp(arrow::datatypes::TimeUnit::Nanosecond, None) } } impl ArrowField for NaiveDate { type Type = Self; + type Native = i32; #[inline] - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::Date32 + fn data_type() -> arrow::datatypes::DataType { + arrow::datatypes::DataType::Date32 } } -impl ArrowField for Buffer { +// Treat both Buffer and ScalarBuffer the same +impl ArrowField for Buffer { type Type = Self; + type Native = Self; #[inline] - fn data_type() -> 
arrow2::datatypes::DataType { - arrow2::datatypes::DataType::Binary + fn data_type() -> arrow::datatypes::DataType { + arrow::datatypes::DataType::Binary + } +} +impl ArrowField for ScalarBuffer { + type Type = Self; + type Native = Self; + + #[inline] + fn data_type() -> arrow::datatypes::DataType { + arrow::datatypes::DataType::Binary } } impl ArrowField for Vec { type Type = Self; + type Native = Self; #[inline] - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::Binary + fn data_type() -> arrow::datatypes::DataType { + arrow::datatypes::DataType::Binary } } @@ -197,35 +222,38 @@ pub struct LargeBinary {} impl ArrowField for LargeBinary { type Type = Vec; + type Native = Vec; #[inline] - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::LargeBinary + fn data_type() -> arrow::datatypes::DataType { + arrow::datatypes::DataType::LargeBinary } } /// Represents the `FixedSizeBinary` Arrow type. -pub struct FixedSizeBinary {} +pub struct FixedSizeBinary {} -impl ArrowField for FixedSizeBinary { +impl ArrowField for FixedSizeBinary { type Type = Vec; + type Native = Vec; #[inline] - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::FixedSizeBinary(SIZE) + fn data_type() -> arrow::datatypes::DataType { + arrow::datatypes::DataType::FixedSizeBinary(SIZE) } } // Blanket implementation for Buffer -impl ArrowField for Buffer +impl ArrowField for ScalarBuffer where - T: ArrowField + NativeType + ArrowEnableVecForType, + T: ArrowField + ArrowNativeType + ArrowEnableVecForType, { type Type = Self; + type Native = Self; #[inline] fn data_type() -> DataType { - DataType::List(Box::new(::field("item"))) + DataType::List(Arc::new(::field("item"))) } } @@ -235,10 +263,11 @@ where T: ArrowField + ArrowEnableVecForType, { type Type = Vec<::Type>; + type Native = Vec<::Native>; #[inline] - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::List(Box::new(::field("item"))) 
+ fn data_type() -> arrow::datatypes::DataType { + arrow::datatypes::DataType::List(Arc::new(::field("item"))) } } @@ -252,27 +281,30 @@ where T: ArrowField + ArrowEnableVecForType, { type Type = Vec<::Type>; + type Native = Vec<::Native>; #[inline] - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::LargeList(Box::new(::field("item"))) + fn data_type() -> arrow::datatypes::DataType { + arrow::datatypes::DataType::LargeList(Arc::new(::field("item"))) } } /// Represents the `FixedSizeList` Arrow type. -pub struct FixedSizeVec { +pub struct FixedSizeVec { d: std::marker::PhantomData, } -impl ArrowField for FixedSizeVec +impl ArrowField for FixedSizeVec where T: ArrowField + ArrowEnableVecForType, { type Type = Vec<::Type>; + type Native = Vec<::Native>; #[inline] - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::FixedSizeList(Box::new(::field("item")), SIZE) + fn data_type() -> arrow::datatypes::DataType { + let field = Field::new("item", ::data_type(), true); + arrow::datatypes::DataType::FixedSizeList(Arc::new(field), SIZE) } } @@ -282,19 +314,23 @@ arrow_enable_vec_for_type!(bool); arrow_enable_vec_for_type!(NaiveDateTime); arrow_enable_vec_for_type!(NaiveDate); arrow_enable_vec_for_type!(Vec); -arrow_enable_vec_for_type!(Buffer); +arrow_enable_vec_for_type!(Buffer); +arrow_enable_vec_for_type!(ScalarBuffer); arrow_enable_vec_for_type!(LargeBinary); -impl ArrowEnableVecForType for FixedSizeBinary {} -impl ArrowEnableVecForType for I128 {} +impl ArrowEnableVecForType for FixedSizeBinary {} +impl ArrowEnableVecForType for I128 {} // Blanket implementation for Vec> if vectors are enabled for T impl ArrowEnableVecForType for Option where T: ArrowField + ArrowEnableVecForType {} // Blanket implementation for Vec> and Vec> if vectors or buffers are enabled for T impl ArrowEnableVecForType for Vec where T: ArrowField + ArrowEnableVecForType {} -impl ArrowEnableVecForType for Buffer where T: ArrowField + 
ArrowEnableVecForType {} +impl ArrowEnableVecForType for ScalarBuffer where + T: ArrowField + ArrowEnableVecForType + ArrowNativeType +{ +} impl ArrowEnableVecForType for LargeVec where T: ArrowField + ArrowEnableVecForType {} -impl ArrowEnableVecForType for FixedSizeVec where +impl ArrowEnableVecForType for FixedSizeVec where T: ArrowField + ArrowEnableVecForType { } diff --git a/arrow2_convert/src/lib.rs b/arrow_convert/src/lib.rs similarity index 81% rename from arrow2_convert/src/lib.rs rename to arrow_convert/src/lib.rs index 01169e9..3244e51 100644 --- a/arrow2_convert/src/lib.rs +++ b/arrow_convert/src/lib.rs @@ -9,9 +9,9 @@ pub mod serialize; // The proc macro is implemented in derive_internal, and re-exported by this // crate. This is because a single crate can not define both a proc macro and a // macro_rules macro. -#[cfg(feature = "arrow2_convert_derive")] +#[cfg(feature = "arrow_convert_derive")] #[doc(hidden)] -pub use arrow2_convert_derive::{ArrowDeserialize, ArrowField, ArrowSerialize}; +pub use arrow_convert_derive::{ArrowDeserialize, ArrowField, ArrowSerialize}; // Test README with doctests #[cfg_attr(not(target_os = "windows"), doc = include_str!("../README.md"))] diff --git a/arrow_convert/src/serialize/mod.rs b/arrow_convert/src/serialize/mod.rs new file mode 100644 index 0000000..9beac79 --- /dev/null +++ b/arrow_convert/src/serialize/mod.rs @@ -0,0 +1,510 @@ +//! Implementation and traits for serializing to Arrow. + +use arrow::datatypes::{ArrowNativeType, Field}; +use arrow::{array::*, datatypes}; +// use arrow::datatypes::ArrowNativeType; +use arrow::buffer::{Buffer, ScalarBuffer}; +use chrono::{NaiveDate, NaiveDateTime}; +use std::sync::Arc; + +mod push_null; +pub use push_null::*; + +use crate::field::*; + +/// Trait that is implemented by all types that are serializable to Arrow. +/// +/// Implementations are provided for all built-in arrow types as well as Vec, and Option +/// if T implements ArrowSerialize. 
+/// +/// Note that Vec implementation needs to be enabled by the [`crate::arrow_enable_vec_for_type`] macro. +pub trait ArrowSerialize: ArrowField { + /// The [`arrow::array::ArrayBuilder`] that holds this value + type ArrayBuilderType: arrow::array::ArrayBuilder; + + /// Create a new mutable array + fn new_array() -> Self::ArrayBuilderType; + + /// Serialize this field to arrow + fn arrow_serialize( + v: &::Type, + array: &mut Self::ArrayBuilderType, + ) -> arrow::error::Result<()>; +} + +// Macro to facilitate implementation of serializable traits for numeric types and numeric mutable arrays. +macro_rules! impl_numeric_type { + ($physical_type:ty, $primitive_type:ty) => { + impl ArrowSerialize for $physical_type { + type ArrayBuilderType = PrimitiveBuilder<$primitive_type>; + + #[inline] + fn new_array() -> Self::ArrayBuilderType { + Self::ArrayBuilderType::default() + } + + #[inline] + fn arrow_serialize( + v: &Self, + array: &mut Self::ArrayBuilderType, + ) -> arrow::error::Result<()> { + array.append_option(Some(*v)); + Ok(()) + } + } + }; +} + +// blanket implementation for optional fields +impl ArrowSerialize for Option +where + T: ArrowSerialize, + T::ArrayBuilderType: PushNull, +{ + type ArrayBuilderType = ::ArrayBuilderType; + + #[inline] + fn new_array() -> Self::ArrayBuilderType { + ::new_array() + } + + #[inline] + fn arrow_serialize( + v: &::Type, + array: &mut Self::ArrayBuilderType, + ) -> arrow::error::Result<()> { + match v.as_ref() { + Some(t) => ::arrow_serialize(t, array), + None => { + array.push_null(); + Ok(()) + } + } + } +} + +impl_numeric_type!(u8, datatypes::UInt8Type); +impl_numeric_type!(u16, datatypes::UInt16Type); +impl_numeric_type!(u32, datatypes::UInt32Type); +impl_numeric_type!(u64, datatypes::UInt64Type); +impl_numeric_type!(i8, datatypes::Int8Type); +impl_numeric_type!(i16, datatypes::Int16Type); +impl_numeric_type!(i32, datatypes::Int32Type); +impl_numeric_type!(i64, datatypes::Int64Type); +impl_numeric_type!(half::f16, 
datatypes::Float16Type); +impl_numeric_type!(f32, datatypes::Float32Type); +impl_numeric_type!(f64, datatypes::Float64Type); + +impl ArrowSerialize for I128 { + type ArrayBuilderType = PrimitiveBuilder; + + #[inline] + fn new_array() -> Self::ArrayBuilderType { + Self::ArrayBuilderType::default().with_data_type(::data_type()) + } + + #[inline] + fn arrow_serialize(v: &i128, array: &mut Self::ArrayBuilderType) -> arrow::error::Result<()> { + array.append_option(Some(*v)); + Ok(()) + } +} + +impl ArrowSerialize for String { + type ArrayBuilderType = StringBuilder; + + #[inline] + fn new_array() -> Self::ArrayBuilderType { + Self::ArrayBuilderType::default() + } + + #[inline] + fn arrow_serialize(v: &Self, array: &mut Self::ArrayBuilderType) -> arrow::error::Result<()> { + array.append_option(Some(v)); + Ok(()) + } +} + +impl ArrowSerialize for LargeString { + type ArrayBuilderType = LargeStringBuilder; + + #[inline] + fn new_array() -> Self::ArrayBuilderType { + Self::ArrayBuilderType::default() + } + + #[inline] + fn arrow_serialize(v: &String, array: &mut Self::ArrayBuilderType) -> arrow::error::Result<()> { + array.append_option(Some(v)); + Ok(()) + } +} + +impl ArrowSerialize for bool { + type ArrayBuilderType = BooleanBuilder; + + #[inline] + fn new_array() -> Self::ArrayBuilderType { + Self::ArrayBuilderType::default() + } + + #[inline] + fn arrow_serialize(v: &Self, array: &mut Self::ArrayBuilderType) -> arrow::error::Result<()> { + array.append_value(*v); + Ok(()) + } +} + +impl ArrowSerialize for NaiveDateTime { + type ArrayBuilderType = TimestampNanosecondBuilder; + + #[inline] + fn new_array() -> Self::ArrayBuilderType { + Self::ArrayBuilderType::default().with_data_type(::data_type()) + } + + #[inline] + fn arrow_serialize(v: &Self, array: &mut Self::ArrayBuilderType) -> arrow::error::Result<()> { + array.append_option(v.and_utc().timestamp_nanos_opt()); + Ok(()) + } +} + +impl ArrowSerialize for NaiveDate { + type ArrayBuilderType = Date32Builder; + + 
#[inline] + fn new_array() -> Self::ArrayBuilderType { + Self::ArrayBuilderType::default().with_data_type(::data_type()) + } + + #[inline] + fn arrow_serialize(v: &Self, array: &mut Self::ArrayBuilderType) -> arrow::error::Result<()> { + array.append_option(Some( + chrono::Datelike::num_days_from_ce(v) - arrow::temporal_conversions::EPOCH_DAYS_FROM_CE, + )); + Ok(()) + } +} + +// Treat both Buffer and ScalarBuffer the same +impl ArrowSerialize for Buffer { + type ArrayBuilderType = BinaryBuilder; + + #[inline] + fn new_array() -> Self::ArrayBuilderType { + Self::ArrayBuilderType::default() + } + + #[inline] + fn arrow_serialize(v: &Self, array: &mut Self::ArrayBuilderType) -> arrow::error::Result<()> { + array.append_option(Some(v.as_slice())); + Ok(()) + } +} +impl ArrowSerialize for ScalarBuffer { + type ArrayBuilderType = BinaryBuilder; + + #[inline] + fn new_array() -> Self::ArrayBuilderType { + Self::ArrayBuilderType::default() + } + + #[inline] + fn arrow_serialize(v: &Self, array: &mut Self::ArrayBuilderType) -> arrow::error::Result<()> { + array.append_option(Some(v)); + Ok(()) + } +} + +impl ArrowSerialize for Vec { + type ArrayBuilderType = BinaryBuilder; + + #[inline] + fn new_array() -> Self::ArrayBuilderType { + Self::ArrayBuilderType::default() + } + + #[inline] + fn arrow_serialize(v: &Self, array: &mut Self::ArrayBuilderType) -> arrow::error::Result<()> { + array.append_option(Some(v)); + Ok(()) + } +} + +impl ArrowSerialize for LargeBinary { + type ArrayBuilderType = LargeBinaryBuilder; + + #[inline] + fn new_array() -> Self::ArrayBuilderType { + Self::ArrayBuilderType::default() + } + + #[inline] + fn arrow_serialize( + v: &Vec, + array: &mut Self::ArrayBuilderType, + ) -> arrow::error::Result<()> { + array.append_option(Some(v)); + Ok(()) + } +} + +impl ArrowSerialize for FixedSizeBinary { + type ArrayBuilderType = FixedSizeBinaryBuilder; + + #[inline] + fn new_array() -> Self::ArrayBuilderType { + Self::ArrayBuilderType::new(SIZE) + } + + 
#[inline] + fn arrow_serialize( + v: &Vec, + array: &mut Self::ArrayBuilderType, + ) -> arrow::error::Result<()> { + array.append_value(v) + } +} + +// Blanket implementation for Buffer +impl ArrowSerialize for ScalarBuffer +where + T: ArrowNativeType + ArrowSerialize + ArrowEnableVecForType + ArrowField, +{ + type ArrayBuilderType = ListBuilder<::ArrayBuilderType>; + + #[inline] + fn new_array() -> Self::ArrayBuilderType { + let field = Arc::new(Field::new( + "item", + ::data_type(), + ::is_nullable(), + )); + ListBuilder::new(::new_array()).with_field(field) + } + + #[inline] + fn arrow_serialize( + v: &::Type, + array: &mut Self::ArrayBuilderType, + ) -> arrow::error::Result<()> { + let values = array.values(); + for i in v.iter() { + ::arrow_serialize(i, values)?; + } + array.append(true); + Ok(()) + } +} + +// Blanket implementation for Vec +impl ArrowSerialize for Vec +where + T: ArrowSerialize + ArrowEnableVecForType + 'static, + ::ArrayBuilderType: Default, +{ + type ArrayBuilderType = ListBuilder<::ArrayBuilderType>; + + #[inline] + fn new_array() -> Self::ArrayBuilderType { + let field = Arc::new(Field::new( + "item", + ::data_type(), + ::is_nullable(), + )); + ListBuilder::new(::new_array()).with_field(field) + } + + fn arrow_serialize( + v: &::Type, + array: &mut Self::ArrayBuilderType, + ) -> arrow::error::Result<()> { + let values = array.values(); + for i in v.iter() { + ::arrow_serialize(i, values)?; + } + array.append(true); + Ok(()) + } +} + +impl ArrowSerialize for LargeVec +where + T: ArrowSerialize + ArrowEnableVecForType + 'static, + ::ArrayBuilderType: Default, +{ + type ArrayBuilderType = LargeListBuilder<::ArrayBuilderType>; + + #[inline] + fn new_array() -> Self::ArrayBuilderType { + let field = Arc::new(Field::new( + "item", + ::data_type(), + ::is_nullable(), + )); + Self::ArrayBuilderType::new(::new_array()).with_field(field) + } + + fn arrow_serialize( + v: &::Type, + array: &mut Self::ArrayBuilderType, + ) -> arrow::error::Result<()> 
{ + let values = array.values(); + for i in v.iter() { + ::arrow_serialize(i, values)?; + } + array.append(true); + Ok(()) + } +} + +impl ArrowSerialize for FixedSizeVec +where + T: ArrowSerialize + ArrowEnableVecForType + 'static, + ::ArrayBuilderType: Default, +{ + type ArrayBuilderType = FixedSizeListBuilder<::ArrayBuilderType>; + + #[inline] + fn new_array() -> Self::ArrayBuilderType { + Self::ArrayBuilderType::new(::new_array(), SIZE) + } + + fn arrow_serialize( + v: &::Type, + array: &mut Self::ArrayBuilderType, + ) -> arrow::error::Result<()> { + let values = array.values(); + for i in v.iter() { + ::arrow_serialize(i, values)?; + } + array.append(true); + Ok(()) + } +} + +// internal helper method to extend a mutable array +fn arrow_serialize_extend_internal< + 'a, + A: 'static, + T: ArrowSerialize + ArrowField + 'static, + I: IntoIterator, +>( + into_iter: I, + array: &mut ::ArrayBuilderType, +) -> arrow::error::Result<()> { + let iter = into_iter.into_iter(); + for i in iter { + ::arrow_serialize(i, array)?; + } + Ok(()) +} + +/// Serializes an iterator into an `arrow::ArrayBuilder` +pub fn arrow_serialize_to_mutable_array< + 'a, + A: 'static, + T: ArrowSerialize + ArrowField + 'static, + I: IntoIterator, +>( + into_iter: I, +) -> arrow::error::Result<::ArrayBuilderType> { + let mut arr = ::new_array(); + arrow_serialize_extend_internal::(into_iter, &mut arr)?; + Ok(arr) +} + +/// API to flatten a RecordBatch consisting of an `arrow::array::StructArray` into a `RecordBatch` consisting of `arrow::array::Array`s contained by the `StructArray` +pub trait FlattenRecordBatch { + /// Convert an `arrow::record_batch::RecordBatch` containing a `arrow::array::StructArray` to an `arrow::record_batch::RecordBatch` consisting of the + /// `arrow::array::Array`s contained by the `StructArray` by consuming the + /// original `RecordBatch`. Returns an error if the `RecordBatch` cannot be flattened. 
+ fn flatten(self) -> Result; +} + +impl FlattenRecordBatch for RecordBatch { + fn flatten(self) -> Result { + let arrays = self.columns(); + + // we only support flattening of a RecordBatch containing a single StructArray + if arrays.len() != 1 { + return Err(arrow::error::ArrowError::InvalidArgumentError( + "RecordBatch must contain a single Array".to_string(), + )); + } + + let array = &arrays[0]; + + let data_type = array.as_ref().data_type(); + if !matches!(data_type, arrow::datatypes::DataType::Struct(_)) { + return Err(arrow::error::ArrowError::InvalidArgumentError( + "Array in RecordBatch must be of type arrow::datatypes::PhysicalType::Struct" + .to_string(), + )); + } + + let struct_array = array + .as_ref() + .as_any() + .downcast_ref::() + .unwrap(); + Ok(RecordBatch::from(struct_array)) + } +} + +/// Top-level API to serialize to Arrow +pub trait TryIntoArrow<'a, ArrowArray, Element> +where + Self: IntoIterator, + Element: 'static, +{ + /// Convert from any iterable collection into an `arrow::Array` + fn try_into_arrow(self) -> arrow::error::Result; + + /// Convert from any iterable collection into an `arrow::Array` by coercing the conversion to a specific Arrow type. + /// This is useful when the same rust type maps to one or more Arrow types for example `LargeString`. 
+ fn try_into_arrow_as_type(self) -> arrow::error::Result + where + ArrowType: ArrowSerialize + ArrowField + 'static; +} + +impl<'a, Element, Collection> TryIntoArrow<'a, ArrayRef, Element> for Collection +where + Element: ArrowSerialize + ArrowField + 'static, + Collection: IntoIterator, +{ + fn try_into_arrow(self) -> arrow::error::Result { + Ok(arrow_serialize_to_mutable_array::(self)?.finish()) + } + + fn try_into_arrow_as_type(self) -> arrow::error::Result + where + Field: ArrowSerialize + ArrowField + 'static, + { + Ok(arrow_serialize_to_mutable_array::(self)?.finish()) + } +} + +impl<'a, Element, Collection> TryIntoArrow<'a, RecordBatch, Element> for Collection +where + Element: ArrowSerialize + ArrowField + 'static, + Collection: IntoIterator, +{ + fn try_into_arrow(self) -> arrow::error::Result { + RecordBatch::try_from_iter([( + "record_batch_item", + arrow_serialize_to_mutable_array::(self)?.finish(), + )]) + } + + fn try_into_arrow_as_type(self) -> arrow::error::Result + where + Field: ArrowSerialize + ArrowField + 'static, + { + RecordBatch::try_from_iter([( + "record_batch_item", + arrow_serialize_to_mutable_array::(self)?.finish(), + )]) + } +} diff --git a/arrow_convert/src/serialize/push_null.rs b/arrow_convert/src/serialize/push_null.rs new file mode 100644 index 0000000..85a66d4 --- /dev/null +++ b/arrow_convert/src/serialize/push_null.rs @@ -0,0 +1,85 @@ +use arrow::{ + array::{ + ArrayBuilder, BinaryBuilder, BooleanBufferBuilder, BooleanBuilder, FixedSizeBinaryBuilder, + FixedSizeListBuilder, LargeBinaryBuilder, LargeListBuilder, LargeStringBuilder, + ListBuilder, PrimitiveBuilder, StringBuilder, + }, + datatypes::ArrowPrimitiveType, +}; + +/// Trait for appending null values to an array builder. +pub trait PushNull { + /// Push a null value to the array builder. 
+ fn push_null(&mut self); +} + +impl PushNull for FixedSizeListBuilder { + fn push_null(&mut self) { + let length = self.value_length(); + let values = self.values(); + for _ in 0..length { + values.push_null(); + } + self.append(false) + } +} + +impl PushNull for BinaryBuilder { + fn push_null(&mut self) { + BinaryBuilder::append_null(self); + } +} + +impl PushNull for LargeBinaryBuilder { + fn push_null(&mut self) { + LargeBinaryBuilder::append_null(self); + } +} + +impl PushNull for FixedSizeBinaryBuilder { + fn push_null(&mut self) { + FixedSizeBinaryBuilder::append_null(self); + } +} + +impl PushNull for LargeStringBuilder { + fn push_null(&mut self) { + LargeStringBuilder::append_null(self); + } +} + +impl PushNull for StringBuilder { + fn push_null(&mut self) { + StringBuilder::append_null(self); + } +} + +impl PushNull for ListBuilder { + fn push_null(&mut self) { + ListBuilder::::append_null(self); + } +} + +impl PushNull for LargeListBuilder { + fn push_null(&mut self) { + LargeListBuilder::::append_null(self); + } +} + +impl PushNull for PrimitiveBuilder { + fn push_null(&mut self) { + PrimitiveBuilder::::append_null(self); + } +} + +impl PushNull for BooleanBuilder { + fn push_null(&mut self) { + BooleanBuilder::append_null(self) + } +} + +impl PushNull for BooleanBufferBuilder { + fn push_null(&mut self) { + BooleanBufferBuilder::append(self, false) + } +} diff --git a/arrow2_convert/tests/complex_example.rs b/arrow_convert/tests/complex_example.rs similarity index 69% rename from arrow2_convert/tests/complex_example.rs rename to arrow_convert/tests/complex_example.rs index b572c49..f2ddd0a 100644 --- a/arrow2_convert/tests/complex_example.rs +++ b/arrow_convert/tests/complex_example.rs @@ -1,11 +1,12 @@ -use arrow2::array::*; -use arrow2_convert::deserialize::{arrow_array_deserialize_iterator, TryIntoCollection}; -use arrow2_convert::serialize::TryIntoArrow; +use arrow::array::*; +use arrow_convert::deserialize::{arrow_array_deserialize_iterator, 
TryIntoCollection}; +use arrow_convert::serialize::TryIntoArrow; /// Complex example that uses the following features: /// /// - Deeply Nested structs and lists /// - Custom types -use arrow2_convert::{ArrowDeserialize, ArrowField, ArrowSerialize}; +use arrow_convert::{ArrowDeserialize, ArrowField, ArrowSerialize}; +use chrono::DateTime; use std::borrow::Borrow; #[derive(Debug, Clone, PartialEq, ArrowField, ArrowSerialize, ArrowDeserialize)] @@ -39,19 +40,19 @@ pub struct Root { // int 32 array int32_array: Vec, // large binary - #[arrow_field(type = "arrow2_convert::field::LargeBinary")] + #[arrow_field(type = "arrow_convert::field::LargeBinary")] large_binary: Vec, // fixed size binary - #[arrow_field(type = "arrow2_convert::field::FixedSizeBinary<3>")] + #[arrow_field(type = "arrow_convert::field::FixedSizeBinary<3>")] fixed_size_binary: Vec, // large string - #[arrow_field(type = "arrow2_convert::field::LargeString")] + #[arrow_field(type = "arrow_convert::field::LargeString")] large_string: String, // large vec - #[arrow_field(type = "arrow2_convert::field::LargeVec")] + #[arrow_field(type = "arrow_convert::field::LargeVec")] large_vec: Vec, // fixed size vec - #[arrow_field(type = "arrow2_convert::field::FixedSizeVec")] + #[arrow_field(type = "arrow_convert::field::FixedSizeVec")] fixed_size_vec: Vec, } @@ -78,47 +79,45 @@ pub struct CustomType(u64); /// - ArrowField /// - ArrowSerialize /// - ArrowDeserialize -impl arrow2_convert::field::ArrowField for CustomType { +impl arrow_convert::field::ArrowField for CustomType { type Type = Self; + type Native = u64; #[inline] - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::Extension( - "custom".to_string(), - Box::new(arrow2::datatypes::DataType::UInt64), - None, - ) + fn data_type() -> arrow::datatypes::DataType { + arrow::datatypes::DataType::UInt64 } } -impl arrow2_convert::serialize::ArrowSerialize for CustomType { - type MutableArrayType = arrow2::array::MutablePrimitiveArray; 
+impl arrow_convert::serialize::ArrowSerialize for CustomType { + type ArrayBuilderType = arrow::array::UInt64Builder; #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::from(::data_type()) + fn new_array() -> Self::ArrayBuilderType { + Self::ArrayBuilderType::default() } #[inline] - fn arrow_serialize(v: &Self, array: &mut Self::MutableArrayType) -> arrow2::error::Result<()> { - array.try_push(Some(v.0)) + fn arrow_serialize(v: &Self, array: &mut Self::ArrayBuilderType) -> arrow::error::Result<()> { + array.append_option(Some(v.0)); + Ok(()) } } -impl arrow2_convert::deserialize::ArrowDeserialize for CustomType { - type ArrayType = arrow2::array::PrimitiveArray; +impl arrow_convert::deserialize::ArrowDeserialize for CustomType { + type ArrayType = arrow::array::UInt64Array; #[inline] - fn arrow_deserialize(v: Option<&u64>) -> Option { - v.map(|t| CustomType(*t)) + fn arrow_deserialize(v: Option) -> Option { + v.map(CustomType) } } // enable Vec -arrow2_convert::arrow_enable_vec_for_type!(CustomType); +arrow_convert::arrow_enable_vec_for_type!(CustomType); fn item1() -> Root { - use chrono::{NaiveDate, NaiveDateTime}; + use chrono::NaiveDate; Root { name: Some("a".to_string()), @@ -127,11 +126,11 @@ fn item1() -> Root { a2: 1, a3: Some(b"aa".to_vec()), a4: NaiveDate::from_ymd_opt(1970, 1, 2).unwrap(), - a5: NaiveDateTime::from_timestamp_opt(10000, 0).unwrap(), - a6: Some(NaiveDateTime::from_timestamp_opt(10001, 0)).unwrap(), + a5: DateTime::from_timestamp(10000, 0).unwrap().naive_local(), + a6: DateTime::from_timestamp(10001, 0).map(|dt| dt.naive_local()), date_time_list: vec![ - NaiveDateTime::from_timestamp_opt(10000, 10).unwrap(), - NaiveDateTime::from_timestamp_opt(10000, 11).unwrap(), + DateTime::from_timestamp(10000, 10).unwrap().naive_local(), + DateTime::from_timestamp(10000, 11).unwrap().naive_local(), ], nullable_list: Some(vec![Some("cc".to_string()), Some("dd".to_string())]), required_list: vec![Some("aa".to_string()), 
Some("bb".to_string())], @@ -164,7 +163,7 @@ fn item1() -> Root { } fn item2() -> Root { - use chrono::{NaiveDate, NaiveDateTime}; + use chrono::NaiveDate; Root { name: Some("b".to_string()), @@ -173,11 +172,11 @@ fn item2() -> Root { a2: 1, a3: Some(b"aa".to_vec()), a4: NaiveDate::from_ymd_opt(1970, 1, 2).unwrap(), - a5: NaiveDateTime::from_timestamp_opt(10000, 0).unwrap(), + a5: DateTime::from_timestamp(10000, 0).unwrap().naive_local(), a6: None, date_time_list: vec![ - NaiveDateTime::from_timestamp_opt(10000, 10).unwrap(), - NaiveDateTime::from_timestamp_opt(10000, 11).unwrap(), + DateTime::from_timestamp(10000, 10).unwrap().naive_local(), + DateTime::from_timestamp(10000, 11).unwrap().naive_local(), ], nullable_list: None, required_list: vec![Some("ee".to_string()), Some("ff".to_string())], @@ -210,18 +209,18 @@ fn item2() -> Root { } #[test] -fn test_round_trip() -> arrow2::error::Result<()> { +fn test_round_trip() -> arrow::error::Result<()> { // serialize to an arrow array let original_array = [item1(), item2()]; - let array: Box = original_array.try_into_arrow()?; + let array: ArrayRef = original_array.try_into_arrow()?; let struct_array = array .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); assert_eq!(struct_array.len(), 2); - let values = struct_array.values(); + let values = struct_array.columns(); assert_eq!(values.len(), 21); assert_eq!(struct_array.len(), 2); diff --git a/arrow2_convert/tests/simple_example.rs b/arrow_convert/tests/simple_example.rs similarity index 78% rename from arrow2_convert/tests/simple_example.rs rename to arrow_convert/tests/simple_example.rs index c7c3963..84b7860 100644 --- a/arrow2_convert/tests/simple_example.rs +++ b/arrow_convert/tests/simple_example.rs @@ -1,10 +1,13 @@ +use std::sync::Arc; + /// Simple example -use arrow2::array::Array; -use arrow2_convert::{ +use arrow::array::Array; +use arrow_convert::{ deserialize::TryIntoCollection, serialize::TryIntoArrow, ArrowDeserialize, ArrowField, 
ArrowSerialize, }; +// #[derive(Debug, Clone, PartialEq, Eq, ArrowField, ArrowDeserialize)] #[derive(Debug, Clone, PartialEq, Eq, ArrowField, ArrowSerialize, ArrowDeserialize)] pub struct Foo { name: String, @@ -12,7 +15,6 @@ pub struct Foo { #[test] fn test_simple_roundtrip() { - // an item let original_array = [ Foo { name: "hello".to_string(), @@ -26,13 +28,13 @@ fn test_simple_roundtrip() { ]; // serialize to an arrow array. try_into_arrow() is enabled by the TryIntoArrow trait - let arrow_array: Box = original_array.try_into_arrow().unwrap(); + let arrow_array: Arc = original_array.try_into_arrow().unwrap(); // which can be cast to an Arrow StructArray and be used for all kinds of IPC, FFI, etc. - // supported by `arrow2` + // supported by `arrow` let struct_array = arrow_array .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); assert_eq!(struct_array.len(), 3); diff --git a/arrow2_convert/tests/test_deserialize.rs b/arrow_convert/tests/test_deserialize.rs similarity index 56% rename from arrow2_convert/tests/test_deserialize.rs rename to arrow_convert/tests/test_deserialize.rs index 1d4efaa..c12cef0 100644 --- a/arrow2_convert/tests/test_deserialize.rs +++ b/arrow_convert/tests/test_deserialize.rs @@ -1,12 +1,13 @@ -use arrow2::error::Result; -use arrow2::{array::*, buffer::Buffer}; -use arrow2_convert::{deserialize::*, serialize::*, ArrowDeserialize, ArrowField, ArrowSerialize}; +use arrow::buffer::ScalarBuffer; +use arrow::error::Result; +use arrow::{array::*, buffer::Buffer}; +use arrow_convert::{deserialize::*, serialize::*, ArrowDeserialize, ArrowField, ArrowSerialize}; #[test] fn test_deserialize_iterator() { - use arrow2::array::*; - use arrow2_convert::deserialize::*; - use arrow2_convert::serialize::*; + use arrow::array::*; + use arrow_convert::deserialize::*; + use arrow_convert::serialize::*; use std::borrow::Borrow; #[derive(Debug, Clone, PartialEq, ArrowField, ArrowSerialize, ArrowDeserialize)] @@ -15,7 +16,7 @@ fn 
test_deserialize_iterator() { } let original_array = [S { a1: 1 }, S { a1: 100 }, S { a1: 1000 }]; - let b: Box = original_array.try_into_arrow().unwrap(); + let b: ArrayRef = original_array.try_into_arrow().unwrap(); let iter = arrow_array_deserialize_iterator::(b.borrow()).unwrap(); for (i, k) in iter.zip(original_array.iter()) { assert_eq!(&i, k); @@ -23,7 +24,7 @@ fn test_deserialize_iterator() { let original_array = [Some(Some(1_i32)), Some(Some(100)), Some(None), None]; let expected = [Some(Some(1_i32)), Some(Some(100)), None, None]; - let b: Box = original_array.try_into_arrow().unwrap(); + let b: ArrayRef = original_array.try_into_arrow().unwrap(); let iter = arrow_array_deserialize_iterator::>>(b.borrow()).unwrap(); for (i, k) in iter.zip(expected.iter()) { assert_eq!(&i, k); @@ -42,12 +43,12 @@ fn test_deserialize_schema_mismatch_error() { } let arr1 = vec![S1 { a: 1 }, S1 { a: 2 }]; - let arr1: Box = arr1.try_into_arrow().unwrap(); + let arr1: ArrayRef = arr1.try_into_arrow().unwrap(); let result: Result> = arr1.try_into_collection(); assert!(result.is_err()); let arr1 = vec![S1 { a: 1 }, S1 { a: 2 }]; - let arr1: Box = arr1.try_into_arrow().unwrap(); + let arr1: ArrayRef = arr1.try_into_arrow().unwrap(); let result: Result> = arr1.try_into_collection_as_type::(); assert!(result.is_err()); } @@ -60,7 +61,7 @@ fn test_deserialize_large_types_schema_mismatch_error() { } #[derive(Debug, Clone, PartialEq, ArrowField, ArrowSerialize, ArrowDeserialize)] struct S2 { - #[arrow_field(type = "arrow2_convert::field::LargeString")] + #[arrow_field(type = "arrow_convert::field::LargeString")] a: String, } @@ -72,38 +73,53 @@ fn test_deserialize_large_types_schema_mismatch_error() { a: "333".to_string(), }, ]; - let arr1: Box = arr1.try_into_arrow().unwrap(); + let arr1: ArrayRef = arr1.try_into_arrow().unwrap(); let result: Result> = arr1.try_into_collection(); assert!(result.is_err()); } #[test] -fn test_deserialize_buffer_u16() { +fn 
test_deserialize_scalar_buffer_u16() { + // test Buffer let original_array = [Buffer::from_iter(0u16..5), Buffer::from_iter(7..9)]; - let b: Box = original_array.try_into_arrow().unwrap(); - let iter = arrow_array_deserialize_iterator::>(b.as_ref()).unwrap(); + let b: ArrayRef = original_array.try_into_arrow().unwrap(); + let iter = arrow_array_deserialize_iterator::(b.as_ref()).unwrap(); + for (i, k) in iter.zip(original_array.iter()) { + assert_eq!(&i, k); + } + + // test ScalarBuffer + let original_array = [ + ScalarBuffer::from_iter(0u16..5), + ScalarBuffer::from_iter(7..9), + ]; + let b: ArrayRef = original_array.try_into_arrow().unwrap(); + let iter = arrow_array_deserialize_iterator::>(b.as_ref()).unwrap(); for (i, k) in iter.zip(original_array.iter()) { assert_eq!(&i, k); } } #[test] -fn test_deserialize_buffer_u8() { - let original_array = [Buffer::from_iter(0u8..5), Buffer::from_iter(7..9)]; - let b: Box = original_array.try_into_arrow().unwrap(); - let iter = arrow_array_deserialize_iterator::>(b.as_ref()).unwrap(); +fn test_deserialize_scalar_buffer_u8() { + let original_array = [ + ScalarBuffer::from_iter(0u8..5), + ScalarBuffer::from_iter(7..9), + ]; + let b: ArrayRef = original_array.try_into_arrow().unwrap(); + let iter = arrow_array_deserialize_iterator::>(b.as_ref()).unwrap(); for (i, k) in iter.zip(original_array.iter()) { assert_eq!(&i, k); } let original_array = [ - Some(Buffer::from_iter(0u8..5)), + Some(ScalarBuffer::from_iter(0u8..5)), None, - Some(Buffer::from_iter(7..9)), + Some(ScalarBuffer::from_iter(7..9)), ]; - let b: Box = original_array.try_into_arrow().unwrap(); - let iter = arrow_array_deserialize_iterator::>>(b.as_ref()).unwrap(); + let b: ArrayRef = original_array.try_into_arrow().unwrap(); + let iter = arrow_array_deserialize_iterator::>>(b.as_ref()).unwrap(); for (i, k) in iter.zip(original_array.iter()) { assert_eq!(&i, k); } diff --git a/arrow2_convert/tests/test_enum.rs b/arrow_convert/tests/test_enum.rs similarity index 91% 
rename from arrow2_convert/tests/test_enum.rs rename to arrow_convert/tests/test_enum.rs index c39fe9b..d1f80ec 100644 --- a/arrow2_convert/tests/test_enum.rs +++ b/arrow_convert/tests/test_enum.rs @@ -1,5 +1,5 @@ -use arrow2::array::*; -use arrow2_convert::{ +use arrow::array::*; +use arrow_convert::{ deserialize::TryIntoCollection, serialize::TryIntoArrow, ArrowDeserialize, ArrowField, ArrowSerialize, }; @@ -21,7 +21,7 @@ fn test_dense_enum_unit_variant() { TestEnum::VAL3, TestEnum::VAL4, ]; - let b: Box = enums.try_into_arrow().unwrap(); + let b: ArrayRef = enums.try_into_arrow().unwrap(); let round_trip: Vec = b.try_into_collection().unwrap(); assert_eq!(round_trip, enums); } @@ -43,7 +43,7 @@ fn test_sparse_enum_unit_variant() { TestEnum::VAL3, TestEnum::VAL4, ]; - let b: Box = enums.try_into_arrow().unwrap(); + let b: ArrayRef = enums.try_into_arrow().unwrap(); let round_trip: Vec = b.try_into_collection().unwrap(); assert_eq!(round_trip, enums); } @@ -104,7 +104,7 @@ fn test_nested_unit_variant() { TestDenseEnum::VAL6(SparseChildEnum::VAL4(TestStruct { a1: 42 })), ]; - let b: Box = enums.try_into_arrow().unwrap(); + let b: ArrayRef = enums.try_into_arrow().unwrap(); let round_trip: Vec = b.try_into_collection().unwrap(); assert_eq!(round_trip, enums); @@ -117,7 +117,7 @@ fn test_nested_unit_variant() { TestSparseEnum::VAL6(SparseChildEnum::VAL4(TestStruct { a1: 42 })), ]; - let b: Box = enums.try_into_arrow().unwrap(); + let b: ArrayRef = enums.try_into_arrow().unwrap(); let round_trip: Vec = b.try_into_collection().unwrap(); assert_eq!(round_trip, enums); } @@ -158,10 +158,10 @@ fn test_slice() { TestEnum::VAL4(TestStruct { a1: 10 }), ]; - let b: Box = enums.try_into_arrow().unwrap(); + let b: ArrayRef = enums.try_into_arrow().unwrap(); for i in 0..enums.len() { - let arrow_slice = b.sliced(i, enums.len() - i); + let arrow_slice = b.slice(i, enums.len() - i); let original_slice = &enums[i..enums.len()]; let round_trip: Vec = 
arrow_slice.try_into_collection().unwrap(); assert_eq!(round_trip, original_slice); diff --git a/arrow_convert/tests/test_flatten_record_batch.rs b/arrow_convert/tests/test_flatten_record_batch.rs new file mode 100644 index 0000000..ce71864 --- /dev/null +++ b/arrow_convert/tests/test_flatten_record_batch.rs @@ -0,0 +1,77 @@ +use arrow::datatypes::{Field, Schema}; +use arrow::record_batch::RecordBatch; +use arrow::{array::*, datatypes::DataType}; +use arrow_convert::{serialize::*, ArrowField, ArrowSerialize}; +use std::sync::Arc; + +#[test] +fn test_flatten_chunk() { + #[derive(Debug, Clone, ArrowField, ArrowSerialize)] + struct Struct { + a: i64, + b: i64, + } + + let target = RecordBatch::try_from_iter([ + ( + "a", + Arc::new(Int64Array::from_iter(&[Some(1), Some(2)])) as ArrayRef, + ), + ( + "b", + Arc::new(Int64Array::from_iter(&[Some(1), Some(2)])) as ArrayRef, + ), + ]) + .unwrap(); + + let array = vec![Struct { a: 1, b: 1 }, Struct { a: 2, b: 2 }]; + + let array: ArrayRef = array.try_into_arrow().unwrap(); + let chunk: RecordBatch = RecordBatch::try_from_iter([("struct", array)]).unwrap(); + + let flattened: RecordBatch = chunk.flatten().unwrap(); + + assert_eq!(flattened, target); +} + +#[test] +fn test_flatten_chunk_empty_chunk_error() { + let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]); + let chunk: RecordBatch = RecordBatch::new_empty(Arc::new(schema)); + assert!(chunk.flatten().is_err()); +} + +#[test] +fn test_flatten_chunk_no_single_struct_array_error() { + #[derive(Debug, Clone, ArrowField, ArrowSerialize)] + struct Struct { + a: i64, + b: String, + } + + let array = vec![ + Struct { + a: 1, + b: "one".to_string(), + }, + Struct { + a: 2, + b: "two".to_string(), + }, + ]; + + let array: ArrayRef = array.try_into_arrow().unwrap(); + + let arrays = vec![("s1", array.clone()), ("s2", array.clone())]; + let chunk = RecordBatch::try_from_iter(arrays).unwrap(); + + assert!(chunk.flatten().is_err()); +} + +#[test] +fn 
test_flatten_chunk_type_not_struct_error() { + let array: ArrayRef = Arc::new(Int32Array::from_iter(&[Some(1), None, Some(3)])); + let chunk = RecordBatch::try_from_iter(vec![("array", array)]).unwrap(); + + assert!(chunk.flatten().is_err()); +} diff --git a/arrow2_convert/tests/test_hygeine.rs b/arrow_convert/tests/test_hygeine.rs similarity index 83% rename from arrow2_convert/tests/test_hygeine.rs rename to arrow_convert/tests/test_hygeine.rs index da22d31..1161162 100644 --- a/arrow2_convert/tests/test_hygeine.rs +++ b/arrow_convert/tests/test_hygeine.rs @@ -1,5 +1,5 @@ /// Tests that the macro generated code doesn't assume the presence of additional bindings and uses absolute paths -use arrow2_convert_derive::ArrowField; +use arrow_convert_derive::ArrowField; #[derive(ArrowField)] #[allow(dead_code)] diff --git a/arrow2_convert/tests/test_macro_errors.rs b/arrow_convert/tests/test_macro_errors.rs similarity index 100% rename from arrow2_convert/tests/test_macro_errors.rs rename to arrow_convert/tests/test_macro_errors.rs diff --git a/arrow2_convert/tests/test_round_trip.rs b/arrow_convert/tests/test_round_trip.rs similarity index 69% rename from arrow2_convert/tests/test_round_trip.rs rename to arrow_convert/tests/test_round_trip.rs index d9a6ed3..35b8586 100644 --- a/arrow2_convert/tests/test_round_trip.rs +++ b/arrow_convert/tests/test_round_trip.rs @@ -1,14 +1,14 @@ -use arrow2::array::*; -use arrow2::datatypes::*; -use arrow2_convert::deserialize::arrow_array_deserialize_iterator_as_type; -use arrow2_convert::deserialize::*; -use arrow2_convert::field::{LargeBinary, I128}; -use arrow2_convert::serialize::*; -use arrow2_convert::{ +use arrow::array::*; +use arrow::datatypes::*; +use arrow_convert::deserialize::arrow_array_deserialize_iterator_as_type; +use arrow_convert::deserialize::*; +use arrow_convert::field::{LargeBinary, I128}; +use arrow_convert::serialize::*; +use arrow_convert::{ field::{FixedSizeBinary, FixedSizeVec, LargeString, LargeVec}, 
ArrowDeserialize, ArrowField, ArrowSerialize, }; -use std::borrow::Borrow; +use half::f16; use std::f32::INFINITY; use std::sync::Arc; @@ -40,7 +40,7 @@ fn test_nested_optional_struct_array() { }, ]; - let b: Box = original_array.try_into_arrow().unwrap(); + let b: ArrayRef = original_array.try_into_arrow().unwrap(); let round_trip: Vec = b.try_into_collection().unwrap(); assert_eq!(original_array, round_trip); } @@ -48,7 +48,7 @@ fn test_nested_optional_struct_array() { #[test] fn test_large_string() { let strs = vec!["1".to_string(), "2".to_string()]; - let b: Box = strs.try_into_arrow_as_type::().unwrap(); + let b: ArrayRef = strs.try_into_arrow_as_type::().unwrap(); assert_eq!(b.data_type(), &DataType::LargeUtf8); let round_trip: Vec = b.try_into_collection_as_type::().unwrap(); assert_eq!(round_trip, strs); @@ -57,10 +57,10 @@ fn test_large_string() { #[test] fn test_large_string_nested() { let strs = [vec!["1".to_string(), "2".to_string()]]; - let b: Box = strs.try_into_arrow_as_type::>().unwrap(); + let b: ArrayRef = strs.try_into_arrow_as_type::>().unwrap(); assert_eq!( b.data_type(), - &DataType::List(Box::new(Field::new("item", DataType::LargeUtf8, false))) + &DataType::List(Arc::new(Field::new("item", DataType::LargeUtf8, false))) ); let round_trip: Vec> = b.try_into_collection_as_type::>().unwrap(); assert_eq!(round_trip, strs); @@ -69,7 +69,7 @@ fn test_large_string_nested() { #[test] fn test_large_binary() { let strs = [b"abc".to_vec()]; - let b: Box = strs.try_into_arrow_as_type::().unwrap(); + let b: ArrayRef = strs.try_into_arrow_as_type::().unwrap(); assert_eq!(b.data_type(), &DataType::LargeBinary); let round_trip: Vec> = b.try_into_collection_as_type::().unwrap(); assert_eq!(round_trip, strs); @@ -78,10 +78,10 @@ fn test_large_binary() { #[test] fn test_large_binary_nested() { let strs = [vec![b"abc".to_vec(), b"abd".to_vec()]]; - let b: Box = strs.try_into_arrow_as_type::>().unwrap(); + let b: ArrayRef = 
strs.try_into_arrow_as_type::>().unwrap(); assert_eq!( b.data_type(), - &DataType::List(Box::new(Field::new("item", DataType::LargeBinary, false))) + &DataType::List(Arc::new(Field::new("item", DataType::LargeBinary, false))) ); let round_trip: Vec>> = b.try_into_collection_as_type::>().unwrap(); @@ -91,7 +91,7 @@ fn test_large_binary_nested() { #[test] fn test_fixed_size_binary() { let strs = [b"abc".to_vec()]; - let b: Box = strs.try_into_arrow_as_type::>().unwrap(); + let b: ArrayRef = strs.try_into_arrow_as_type::>().unwrap(); assert_eq!(b.data_type(), &DataType::FixedSizeBinary(3)); let round_trip: Vec> = b .try_into_collection_as_type::>() @@ -102,10 +102,10 @@ fn test_fixed_size_binary() { #[test] fn test_large_vec() { let ints = vec![vec![1, 2, 3]]; - let b: Box = ints.try_into_arrow_as_type::>().unwrap(); + let b: ArrayRef = ints.try_into_arrow_as_type::>().unwrap(); assert_eq!( b.data_type(), - &DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))) + &DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, false))) ); let round_trip: Vec> = b.try_into_collection_as_type::>().unwrap(); assert_eq!(round_trip, ints); @@ -114,12 +114,12 @@ fn test_large_vec() { #[test] fn test_large_vec_nested() { let strs = [vec![b"abc".to_vec(), b"abd".to_vec()]]; - let b: Box = strs + let b: ArrayRef = strs .try_into_arrow_as_type::>() .unwrap(); assert_eq!( b.data_type(), - &DataType::LargeList(Box::new(Field::new("item", DataType::LargeBinary, false))) + &DataType::LargeList(Arc::new(Field::new("item", DataType::LargeBinary, false))) ); let round_trip: Vec>> = b .try_into_collection_as_type::>() @@ -130,12 +130,12 @@ fn test_large_vec_nested() { #[test] fn test_fixed_size_vec() { let ints = vec![vec![1, 2, 3]]; - let b: Box = ints + let b: ArrayRef = ints .try_into_arrow_as_type::>() .unwrap(); assert_eq!( b.data_type(), - &DataType::FixedSizeList(Box::new(Field::new("item", DataType::Int32, false)), 3) + 
&DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 3) ); let round_trip: Vec> = b .try_into_collection_as_type::>() @@ -148,12 +148,12 @@ fn test_primitive_type_vec() { macro_rules! test_int_type { ($t:ty) => { let original_array = vec![1 as $t, 2, 3]; - let b: Box = original_array.try_into_arrow().unwrap(); + let b: ArrayRef = original_array.try_into_arrow().unwrap(); let round_trip: Vec<$t> = b.try_into_collection().unwrap(); assert_eq!(original_array, round_trip); let original_array = vec![Some(1 as $t), None, Some(3)]; - let b: Box = original_array.try_into_arrow().unwrap(); + let b: ArrayRef = original_array.try_into_arrow().unwrap(); let round_trip: Vec> = b.try_into_collection().unwrap(); assert_eq!(original_array, round_trip); @@ -168,12 +168,12 @@ fn test_primitive_type_vec() { macro_rules! test_float_type { ($t:ty) => { let original_array = vec![1 as $t, 2., 3.]; - let b: Box = original_array.try_into_arrow().unwrap(); + let b: ArrayRef = original_array.try_into_arrow().unwrap(); let round_trip: Vec<$t> = b.try_into_collection().unwrap(); assert_eq!(original_array, round_trip); let original_array = vec![Some(1 as $t), None, Some(3.)]; - let b: Box = original_array.try_into_arrow().unwrap(); + let b: ArrayRef = original_array.try_into_arrow().unwrap(); let round_trip: Vec> = b.try_into_collection().unwrap(); assert_eq!(original_array, round_trip); @@ -195,31 +195,30 @@ fn test_primitive_type_vec() { test_float_type!(f32); test_float_type!(f64); - // `arrow2::types::f16` isn't a native type so we can't just use `as` + // `f16` isn't a native type so we can't just use `as` { - let original_array: Vec = - vec![1.0, 2.5, 47800.0, 0.000012, -0.0, 0.0, INFINITY] - .iter() - .map(|f| arrow2::types::f16::from_f32(*f)) - .collect(); - let b: Box = original_array.try_into_arrow().unwrap(); - let round_trip: Vec = b.try_into_collection().unwrap(); + let original_array: Vec = [1.0, 2.5, 47800.0, 0.000012, -0.0, 0.0, INFINITY] + .iter() + 
.map(|f| f16::from_f32(*f)) + .collect(); + let b: ArrayRef = original_array.try_into_arrow().unwrap(); + let round_trip: Vec = b.try_into_collection().unwrap(); assert_eq!(original_array, round_trip); - let original_array: Vec> = vec![Some(1.), None, Some(3.)] + let original_array: Vec> = [Some(1.), None, Some(3.)] .iter() - .map(|f| f.map(arrow2::types::f16::from_f32)) + .map(|f| f.map(f16::from_f32)) .collect(); - let b: Box = original_array.try_into_arrow().unwrap(); - let round_trip: Vec> = b.try_into_collection().unwrap(); + let b: ArrayRef = original_array.try_into_arrow().unwrap(); + let round_trip: Vec> = b.try_into_collection().unwrap(); assert_eq!(original_array, round_trip); - let original_array: Vec> = vec![Some(1.), None, Some(3.)] + let original_array: Vec> = [Some(1.), None, Some(3.)] .iter() - .map(|f| f.map(arrow2::types::f16::from_f32)) + .map(|f| f.map(f16::from_f32)) .collect(); let b: Arc = original_array.try_into_arrow().unwrap(); - let round_trip: Vec> = b.try_into_collection().unwrap(); + let round_trip: Vec> = b.try_into_collection().unwrap(); assert_eq!(original_array, round_trip); }; @@ -227,39 +226,41 @@ fn test_primitive_type_vec() { // i128 is special since we need to require precision and scale so the TryIntoArrow trait // is not implemented for Vec. 
let original_array = vec![1_i128, 2, 3]; - let b: Box = arrow_serialize_to_mutable_array::<_, I128<32, 32>, _>(&original_array) - .unwrap() - .as_box(); - let round_trip: Vec = - arrow_array_deserialize_iterator_as_type::<_, I128<32, 32>>(b.borrow()) + let b: ArrayRef = Arc::new( + arrow_serialize_to_mutable_array::<_, I128<32, 32>, _>(&original_array) .unwrap() - .collect(); + .finish(), + ); + let round_trip: Vec = arrow_array_deserialize_iterator_as_type::<_, I128<32, 32>>(&b) + .unwrap() + .collect(); assert_eq!(original_array, round_trip); let original_array = vec![Some(1_i128), None, Some(3)]; - let b: Box = + let b: ArrayRef = Arc::new( arrow_serialize_to_mutable_array::<_, Option>, _>(&original_array) .unwrap() - .as_box(); + .finish(), + ); let round_trip: Vec> = - arrow_array_deserialize_iterator_as_type::<_, Option>>(b.borrow()) + arrow_array_deserialize_iterator_as_type::<_, Option>>(&b) .unwrap() .collect(); assert_eq!(original_array, round_trip); // bool let original_array = vec![false, true, false]; - let b: Box = original_array.try_into_arrow().unwrap(); + let b: ArrayRef = original_array.try_into_arrow().unwrap(); let round_trip: Vec = b.try_into_collection().unwrap(); assert_eq!(original_array, round_trip); let original_array = vec![Some(false), Some(true), None]; - let b: Box = original_array.try_into_arrow().unwrap(); + let b: ArrayRef = original_array.try_into_arrow().unwrap(); let round_trip: Vec> = b.try_into_collection().unwrap(); assert_eq!(original_array, round_trip); let original_array = vec![Some(b"aa".to_vec()), None]; - let b: Box = original_array.try_into_arrow().unwrap(); + let b: ArrayRef = original_array.try_into_arrow().unwrap(); let round_trip: Vec>> = b.try_into_collection().unwrap(); assert_eq!(original_array, round_trip); } @@ -271,11 +272,11 @@ fn test_escaped_name() { r#type: bool, } let array = [EscapedName { r#type: true }, EscapedName { r#type: false }]; - let b: Box = array.try_into_arrow().unwrap(); + let b: ArrayRef = 
array.try_into_arrow().unwrap(); let ty = b.data_type(); match ty { DataType::Struct(s) => { - assert_eq!(s[0].name, "type"); + assert_eq!(s[0].name(), "type"); } _ => unreachable!(), } diff --git a/arrow2_convert/tests/test_schema.rs b/arrow_convert/tests/test_schema.rs similarity index 52% rename from arrow2_convert/tests/test_schema.rs rename to arrow_convert/tests/test_schema.rs index 1e465df..97f72f9 100644 --- a/arrow2_convert/tests/test_schema.rs +++ b/arrow_convert/tests/test_schema.rs @@ -1,5 +1,7 @@ -use arrow2::datatypes::*; -use arrow2_convert::ArrowField; +use std::sync::Arc; + +use arrow::datatypes::*; +use arrow_convert::ArrowField; #[test] fn test_schema_types() { @@ -19,7 +21,7 @@ fn test_schema_types() { // timestamp(ns, None) a6: Option, // i128(precision, scale) - #[arrow_field(type = "arrow2_convert::field::I128<32, 32>")] + #[arrow_field(type = "arrow_convert::field::I128<32, 32>")] a7: i128, // array of date times date_time_list: Vec, @@ -27,30 +29,30 @@ fn test_schema_types() { nullable_list: Option>>, // optional list array of required strings required_list: Vec>, - // custom type - custom: CustomType, - // custom optional type - nullable_custom: Option, - // vec custom type - custom_list: Vec, + // // custom type + // custom: CustomType, + // // custom optional type + // nullable_custom: Option, + // // vec custom type + // custom_list: Vec, // nested struct child: Child, // int 32 array int32_array: Vec, // large binary - #[arrow_field(type = "arrow2_convert::field::LargeBinary")] + #[arrow_field(type = "arrow_convert::field::LargeBinary")] large_binary: Vec, // fixed size binary - #[arrow_field(type = "arrow2_convert::field::FixedSizeBinary<3>")] + #[arrow_field(type = "arrow_convert::field::FixedSizeBinary<3>")] fixed_size_binary: Vec, // large string - #[arrow_field(type = "arrow2_convert::field::LargeString")] + #[arrow_field(type = "arrow_convert::field::LargeString")] large_string: String, // large vec - #[arrow_field(type = 
"arrow2_convert::field::LargeVec")] + #[arrow_field(type = "arrow_convert::field::LargeVec")] large_vec: Vec, // fixed size vec - #[arrow_field(type = "arrow2_convert::field::FixedSizeVec")] + #[arrow_field(type = "arrow_convert::field::FixedSizeVec")] fixed_size_vec: Vec, } @@ -72,53 +74,53 @@ fn test_schema_types() { } // enable Vec - arrow2_convert::arrow_enable_vec_for_type!(CustomType); - - #[derive(Debug)] - /// A newtype around a u64 - pub struct CustomType(u64); - - impl arrow2_convert::field::ArrowField for CustomType { - type Type = Self; - - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::Extension( - "custom".to_string(), - Box::new(arrow2::datatypes::DataType::UInt64), - None, - ) - } - } - - impl arrow2_convert::serialize::ArrowSerialize for CustomType { - type MutableArrayType = arrow2::array::MutablePrimitiveArray; - - #[inline] - fn new_array() -> Self::MutableArrayType { - unimplemented!(); - } - - #[inline] - fn arrow_serialize( - _v: &Self, - _array: &mut Self::MutableArrayType, - ) -> arrow2::error::Result<()> { - unimplemented!(); - } - } - - impl arrow2_convert::deserialize::ArrowDeserialize for CustomType { - type ArrayType = arrow2::array::PrimitiveArray; - - #[inline] - fn arrow_deserialize(_v: Option<&u64>) -> Option { - unimplemented!(); - } - } + // arrow_convert::arrow_enable_vec_for_type!(CustomType); + + // #[derive(Debug)] + // /// A newtype around a u64 + // pub struct CustomType(u64); + + // impl arrow_convert::field::ArrowField for CustomType { + // type Type = Self; + + // fn data_type() -> arrow::datatypes::DataType { + // arrow::datatypes::DataType::Extension( + // "custom".to_string(), + // Arc::new(arrow::datatypes::DataType::UInt64), + // None, + // ) + // } + // } + + // impl arrow_convert::serialize::ArrowSerialize for CustomType { + // type ArrayBuilderType = arrow::array::UInt64Builder; + + // #[inline] + // fn new_array() -> Self::ArrayBuilderType { + // unimplemented!(); + // } + + // 
#[inline] + // fn arrow_serialize( + // _v: &Self, + // _array: &mut Self::ArrayBuilderType, + // ) -> arrow::error::Result<()> { + // unimplemented!(); + // } + // } + + // impl arrow_convert::deserialize::ArrowDeserialize for CustomType { + // type ArrayType = arrow::array::PrimitiveArray; + + // #[inline] + // fn arrow_deserialize(_v: Option) -> Option { + // unimplemented!(); + // } + // } assert_eq!( - ::data_type(), - DataType::Struct(vec![ + ::data_type(), + DataType::Struct(Fields::from(vec![ Field::new("name", DataType::Utf8, true), Field::new("is_deleted", DataType::Boolean, false), Field::new("a1", DataType::Float64, true), @@ -127,10 +129,10 @@ fn test_schema_types() { Field::new("a4", DataType::Date32, false), Field::new("a5", DataType::Timestamp(TimeUnit::Nanosecond, None), false), Field::new("a6", DataType::Timestamp(TimeUnit::Nanosecond, None), true), - Field::new("a7", DataType::Decimal(32, 32), false), + Field::new("a7", DataType::Decimal128(32, 32), false), Field::new( "date_time_list", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "item", DataType::Timestamp(TimeUnit::Nanosecond, None), false @@ -139,47 +141,47 @@ fn test_schema_types() { ), Field::new( "nullable_list", - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), true ), Field::new( "required_list", - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), - false - ), - Field::new( - "custom", - DataType::Extension("custom".to_string(), Box::new(DataType::UInt64), None), - false - ), - Field::new( - "nullable_custom", - DataType::Extension("custom".to_string(), Box::new(DataType::UInt64), None), - true - ), - Field::new( - "custom_list", - DataType::List(Box::new(Field::new( - "item", - DataType::Extension("custom".to_string(), Box::new(DataType::UInt64), None), - false - ))), + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), false ), + // 
Field::new( + // "custom", + // DataType::Extension("custom".to_string(), Arc::new(DataType::UInt64), None), + // false + // ), + // Field::new( + // "nullable_custom", + // DataType::Extension("custom".to_string(), Arc::new(DataType::UInt64), None), + // true + // ), + // Field::new( + // "custom_list", + // DataType::List(Arc::new(Field::new( + // "item", + // DataType::Extension("custom".to_string(), Arc::new(DataType::UInt64), None), + // false + // ))), + // false + // ), Field::new( "child", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("a1", DataType::Int64, false), Field::new("a2", DataType::Utf8, false), Field::new( "child_array", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "item", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("a1", DataType::Int32, false), Field::new( "bool_array", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "item", DataType::Boolean, false @@ -188,24 +190,24 @@ fn test_schema_types() { ), Field::new( "int64_array", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "item", DataType::Int64, false ))), false ), - ]), + ])), false ))), false ) - ]), + ])), false ), Field::new( "int32_array", - DataType::List(Box::new(Field::new("item", DataType::Int32, false))), + DataType::List(Arc::new(Field::new("item", DataType::Int32, false))), false ), Field::new("large_binary", DataType::LargeBinary, false), @@ -213,31 +215,31 @@ fn test_schema_types() { Field::new("large_string", DataType::LargeUtf8, false), Field::new( "large_vec", - DataType::LargeList(Box::new(Field::new("item", DataType::Int64, false))), + DataType::LargeList(Arc::new(Field::new("item", DataType::Int64, false))), false ), Field::new( "fixed_size_vec", - DataType::FixedSizeList(Box::new(Field::new("item", DataType::Int64, false)), 3), + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int64, true)), 3), false ), - ]) + ])) 
); } #[test] fn test_large_string_schema() { - use arrow2_convert::field::LargeString; + use arrow_convert::field::LargeString; assert_eq!( - ::data_type(), + ::data_type(), DataType::LargeUtf8 ); - assert!(!::is_nullable()); - assert!( as arrow2_convert::field::ArrowField>::is_nullable()); + assert!(!::is_nullable()); + assert!( as arrow_convert::field::ArrowField>::is_nullable()); assert_eq!( - as arrow2_convert::field::ArrowField>::data_type(), - DataType::List(Box::new(Field::new("item", DataType::LargeUtf8, false))) + as arrow_convert::field::ArrowField>::data_type(), + DataType::List(Arc::new(Field::new("item", DataType::LargeUtf8, false))) ); } diff --git a/arrow_convert/tests/test_serialize.rs b/arrow_convert/tests/test_serialize.rs new file mode 100644 index 0000000..b1a9294 --- /dev/null +++ b/arrow_convert/tests/test_serialize.rs @@ -0,0 +1,117 @@ +use arrow::array::{Array, ArrayRef}; +use arrow::buffer::{Buffer, ScalarBuffer}; +use arrow::record_batch::RecordBatch; +use arrow_convert::field::{ArrowField, FixedSizeBinary}; +use arrow_convert::serialize::*; + +#[test] +fn test_error_exceed_fixed_size_binary() { + let strs = [b"abc".to_vec()]; + let r: arrow::error::Result = strs.try_into_arrow_as_type::>(); + assert!(r.is_err()) +} + +#[test] +fn test_record_batch() { + let strs = [b"abc".to_vec()]; + let r: RecordBatch = strs.try_into_arrow_as_type::>().unwrap(); + assert_eq!(r.num_rows(), 1); + assert_eq!( + r.columns()[0].data_type(), + & as ArrowField>::data_type() + ); + + let r: RecordBatch = strs.try_into_arrow().unwrap(); + assert_eq!(r.num_rows(), 1); + assert_eq!( + r.columns()[0].data_type(), + & as ArrowField>::data_type() + ); +} + +#[test] +fn test_array() { + let strs = [b"abc".to_vec()]; + let r: ArrayRef = strs.try_into_arrow_as_type::>().unwrap(); + assert_eq!(r.len(), 1); + assert_eq!( + r.data_type(), + & as ArrowField>::data_type() + ); + + let r: ArrayRef = strs.try_into_arrow().unwrap(); + assert_eq!(r.len(), 1); + 
assert_eq!(r.data_type(), & as ArrowField>::data_type()); +} + +#[test] +fn test_buffer() { + // Buffer, ScalarBuffer and Vec should serialize into BinaryArray + let b: Vec = vec![(0..10).collect()]; + let rb: ArrayRef = b.try_into_arrow().unwrap(); + let dat: Vec> = vec![(0..10).collect()]; + let r: ArrayRef = dat.try_into_arrow().unwrap(); + assert_eq!(rb.len(), 1); + assert_eq!(r.len(), 1); + assert_eq!(r.data_type(), &::data_type()); + assert_eq!( + r.data_type(), + & as ArrowField>::data_type() + ); + assert_eq!(r.data_type(), & as ArrowField>::data_type()); + + // ScalarBuffer and Vec should serialize into ListArray + let dat: Vec> = vec![(0..10).collect()]; + let r: ArrayRef = dat.try_into_arrow().unwrap(); + assert_eq!(r.len(), 1); + assert_eq!( + r.data_type(), + & as ArrowField>::data_type() + ); + assert_eq!(r.data_type(), & as ArrowField>::data_type()); +} + +// #[test] +// fn test_field_serialize_error() { +// pub struct CustomType(u64); + +// impl arrow_convert::field::ArrowField for CustomType { +// type Type = Self; + +// #[inline] +// fn data_type() -> arrow::datatypes::DataType { +// arrow::datatypes::DataType::Extension( +// "custom".to_string(), +// Box::new(arrow::datatypes::DataType::UInt64), +// None, +// ) +// } +// } + +// impl arrow_convert::serialize::ArrowSerialize for CustomType { +// type ArrayBuilderType = arrow::array::UInt64Builder; + +// #[inline] +// fn new_array() -> Self::ArrayBuilderType { +// Self::ArrayBuilderType::from(::data_type()) +// } + +// #[inline] +// fn arrow_serialize(_: &Self, _: &mut Self::ArrayBuilderType) -> arrow::error::Result<()> { +// Err(arrow::error::Error::NotYetImplemented("".to_owned())) +// } +// } + +// impl arrow_convert::deserialize::ArrowDeserialize for CustomType { +// type ArrayType = arrow::array::PrimitiveArray; + +// #[inline] +// fn arrow_deserialize(v: Option<&u64>) -> Option { +// v.map(|t| CustomType(*t)) +// } +// } + +// let arr = vec![CustomType(0)]; +// let r: arrow::error::Result = 
arr.try_into_arrow(); +// assert!(r.is_err()) +// } diff --git a/arrow2_convert/tests/test_struct.rs b/arrow_convert/tests/test_struct.rs similarity index 83% rename from arrow2_convert/tests/test_struct.rs rename to arrow_convert/tests/test_struct.rs index b9cd3c1..cf9b1fc 100644 --- a/arrow2_convert/tests/test_struct.rs +++ b/arrow_convert/tests/test_struct.rs @@ -1,5 +1,5 @@ -use arrow2::array::*; -use arrow2_convert::{deserialize::*, serialize::*, ArrowDeserialize, ArrowField, ArrowSerialize}; +use arrow::array::*; +use arrow_convert::{deserialize::*, serialize::*, ArrowDeserialize, ArrowField, ArrowSerialize}; #[test] fn test_nested_optional_struct_array() { @@ -29,7 +29,7 @@ fn test_nested_optional_struct_array() { }, ]; - let b: Box = original_array.try_into_arrow().unwrap(); + let b: ArrayRef = original_array.try_into_arrow().unwrap(); let round_trip: Vec = b.try_into_collection().unwrap(); assert_eq!(original_array, round_trip); } @@ -43,10 +43,10 @@ fn test_slice() { let original = vec![T { a1: 1 }, T { a1: 2 }, T { a1: 3 }, T { a1: 4 }]; - let b: Box = original.try_into_arrow().unwrap(); + let b: ArrayRef = original.try_into_arrow().unwrap(); for i in 0..original.len() { - let arrow_slice = b.sliced(i, original.len() - i); + let arrow_slice = b.slice(i, original.len() - i); let original_slice = &original[i..original.len()]; let round_trip: Vec = arrow_slice.try_into_collection().unwrap(); assert_eq!(round_trip, original_slice); @@ -81,10 +81,10 @@ fn test_nested_slice() { }, ]; - let b: Box = original.try_into_arrow().unwrap(); + let b: ArrayRef = original.try_into_arrow().unwrap(); for i in 0..original.len() { - let arrow_slice = b.sliced(i, original.len() - i); + let arrow_slice = b.slice(i, original.len() - i); let original_slice = &original[i..original.len()]; let round_trip: Vec = arrow_slice.try_into_collection().unwrap(); assert_eq!(round_trip, original_slice); diff --git a/arrow2_convert/tests/ui/derive_on_trait.rs 
b/arrow_convert/tests/ui/derive_on_trait.rs similarity index 61% rename from arrow2_convert/tests/ui/derive_on_trait.rs rename to arrow_convert/tests/ui/derive_on_trait.rs index 51b011e..747a365 100644 --- a/arrow2_convert/tests/ui/derive_on_trait.rs +++ b/arrow_convert/tests/ui/derive_on_trait.rs @@ -1,4 +1,4 @@ -use arrow2_convert::ArrowField; +use arrow_convert::ArrowField; #[derive(ArrowField)] trait Test diff --git a/arrow2_convert/tests/ui/derive_on_trait.stderr b/arrow_convert/tests/ui/derive_on_trait.stderr similarity index 74% rename from arrow2_convert/tests/ui/derive_on_trait.stderr rename to arrow_convert/tests/ui/derive_on_trait.stderr index e8e5562..1cc28bc 100644 --- a/arrow2_convert/tests/ui/derive_on_trait.stderr +++ b/arrow_convert/tests/ui/derive_on_trait.stderr @@ -7,10 +7,10 @@ error[E0774]: `derive` may only be applied to `struct`s, `enum`s and `union`s 5 | | {} | |__- not a `struct`, `enum` or `union` -warning: unused import: `arrow2_convert::ArrowField` +warning: unused import: `arrow_convert::ArrowField` --> tests/ui/derive_on_trait.rs:1:5 | -1 | use arrow2_convert::ArrowField; - | ^^^^^^^^^^^^^^^^^^^^^^^^^^ +1 | use arrow_convert::ArrowField; + | ^^^^^^^^^^^^^^^^^^^^^^^^^ | = note: `#[warn(unused_imports)]` on by default diff --git a/arrow2_convert/tests/ui/struct_incorrect_type.rs b/arrow_convert/tests/ui/struct_incorrect_type.rs similarity index 58% rename from arrow2_convert/tests/ui/struct_incorrect_type.rs rename to arrow_convert/tests/ui/struct_incorrect_type.rs index 11bf47b..9129119 100644 --- a/arrow2_convert/tests/ui/struct_incorrect_type.rs +++ b/arrow_convert/tests/ui/struct_incorrect_type.rs @@ -1,5 +1,5 @@ -use arrow2_convert::field::LargeBinary; -use arrow2_convert::{ArrowDeserialize, ArrowField, ArrowSerialize}; +use arrow_convert::field::LargeBinary; +use arrow_convert::{ArrowDeserialize, ArrowField, ArrowSerialize}; #[derive(Debug, ArrowField, ArrowSerialize, ArrowDeserialize)] struct Test { diff --git 
a/arrow2_convert/tests/ui/struct_incorrect_type.stderr b/arrow_convert/tests/ui/struct_incorrect_type.stderr similarity index 92% rename from arrow2_convert/tests/ui/struct_incorrect_type.stderr rename to arrow_convert/tests/ui/struct_incorrect_type.stderr index 0e5d3e4..8e6b124 100644 --- a/arrow2_convert/tests/ui/struct_incorrect_type.stderr +++ b/arrow_convert/tests/ui/struct_incorrect_type.stderr @@ -5,6 +5,7 @@ error[E0277]: the trait bound `String: Borrow>` is not satisfied | ^^^^^^^^^^^^^^ the trait `Borrow>` is not implemented for `String` | = help: the trait `Borrow` is implemented for `String` + = help: for that trait implementation, expected `str`, found `Vec` = note: this error originates in the derive macro `ArrowSerialize` (in Nightly builds, run with -Z macro-backtrace for more info) error[E0308]: mismatched types diff --git a/arrow2_convert/tests/ui/struct_no_fields.rs b/arrow_convert/tests/ui/struct_no_fields.rs similarity index 60% rename from arrow2_convert/tests/ui/struct_no_fields.rs rename to arrow_convert/tests/ui/struct_no_fields.rs index 45ea4ca..a2c1374 100644 --- a/arrow2_convert/tests/ui/struct_no_fields.rs +++ b/arrow_convert/tests/ui/struct_no_fields.rs @@ -1,4 +1,4 @@ -use arrow2_convert::ArrowField; +use arrow_convert::ArrowField; #[derive(ArrowField)] struct S {} diff --git a/arrow2_convert/tests/ui/struct_no_fields.stderr b/arrow_convert/tests/ui/struct_no_fields.stderr similarity index 100% rename from arrow2_convert/tests/ui/struct_no_fields.stderr rename to arrow_convert/tests/ui/struct_no_fields.stderr diff --git a/arrow2_convert_derive/Cargo.toml b/arrow_convert_derive/Cargo.toml similarity index 58% rename from arrow2_convert_derive/Cargo.toml rename to arrow_convert_derive/Cargo.toml index d5a51d8..d443683 100644 --- a/arrow2_convert_derive/Cargo.toml +++ b/arrow_convert_derive/Cargo.toml @@ -1,15 +1,16 @@ [package] -name = "arrow2_convert_derive" -version = "0.5.0" +name = "arrow_convert_derive" +version = "0.6.0" authors = 
[ + "Swoorup Joshi ", "Jorge Leitao ", "Chandra Penke " ] edition = "2021" license = "Apache-2.0 OR MIT" -keywords = ["Arrow", "arrow2"] -repository = "https://github.com/DataEngineeringLabs/arrow2-convert" -description = "Proc macros for arrow2_convert" +keywords = ["Arrow", "arrow"] +repository = "https://github.com/Swoorup/arrow-convert" +description = "Proc macros for arrow_convert" [lib] proc-macro = true diff --git a/arrow2_convert_derive/LICENSE-APACHE b/arrow_convert_derive/LICENSE-APACHE similarity index 100% rename from arrow2_convert_derive/LICENSE-APACHE rename to arrow_convert_derive/LICENSE-APACHE diff --git a/arrow2_convert_derive/LICENSE-MIT b/arrow_convert_derive/LICENSE-MIT similarity index 100% rename from arrow2_convert_derive/LICENSE-MIT rename to arrow_convert_derive/LICENSE-MIT diff --git a/arrow2_convert_derive/README.md b/arrow_convert_derive/README.md similarity index 100% rename from arrow2_convert_derive/README.md rename to arrow_convert_derive/README.md diff --git a/arrow2_convert_derive/src/attr.rs b/arrow_convert_derive/src/attr.rs similarity index 100% rename from arrow2_convert_derive/src/attr.rs rename to arrow_convert_derive/src/attr.rs diff --git a/arrow2_convert_derive/src/derive_enum.rs b/arrow_convert_derive/src/derive_enum.rs similarity index 58% rename from arrow2_convert_derive/src/derive_enum.rs rename to arrow_convert_derive/src/derive_enum.rs index 86f2af4..348ac5c 100644 --- a/arrow2_convert_derive/src/derive_enum.rs +++ b/arrow_convert_derive/src/derive_enum.rs @@ -26,9 +26,9 @@ impl<'a> From<&'a DeriveEnum> for Common<'a> { let variants = &input.variants; let union_type = if is_dense { - quote!(arrow2::datatypes::UnionMode::Dense) + quote!(arrow::datatypes::UnionMode::Dense) } else { - quote!(arrow2::datatypes::UnionMode::Sparse) + quote!(arrow::datatypes::UnionMode::Sparse) }; let variant_names = variants @@ -87,24 +87,32 @@ pub fn expand_field(input: DeriveEnum) -> TokenStream { .. 
} = (&input).into(); + let num_variants = syn::LitInt::new( + &format!("{}", variant_types.len()), + proc_macro2::Span::call_site(), + ); + quote! { - impl arrow2_convert::field::ArrowField for #original_name { + impl arrow_convert::field::ArrowField for #original_name { type Type = Self; - - fn data_type() -> arrow2::datatypes::DataType { - arrow2::datatypes::DataType::Union( - vec![ - #( - <#variant_types as arrow2_convert::field::ArrowField>::field(#variant_names_str), - )* - ], - None, + type Native = Self; + + fn data_type() -> arrow::datatypes::DataType { + arrow::datatypes::DataType::Union( + arrow::datatypes::UnionFields::new( + 0..#num_variants, // basically union tag id or here called type_id + vec![ + #( + <#variant_types as arrow_convert::field::ArrowField>::field(#variant_names_str), + )* + ] + ), #union_type, ) } } - arrow2_convert::arrow_enable_vec_for_type!(#original_name); + arrow_convert::arrow_enable_vec_for_type!(#original_name); } } @@ -124,21 +132,35 @@ pub fn expand_serialize(input: DeriveEnum) -> TokenStream { let mutable_array_name = &input.common.mutable_array_name(); let mutable_variant_array_types = variant_types .iter() - .map(|field_type| quote_spanned!( field_type.span() => <#field_type as arrow2_convert::serialize::ArrowSerialize>::MutableArrayType)) + .map(|field_type| quote_spanned!( field_type.span() => <#field_type as arrow_convert::serialize::ArrowSerialize>::ArrayBuilderType)) .collect::>(); - let (offsets_decl, offsets_init, offsets_reserve, offsets_take, offsets_shrink_to_fit) = - if is_dense { - ( - quote! { offsets: Vec, }, - quote! { offsets: vec![], }, - quote! { self.offsets.reserve(additional); }, - quote! { Some(std::mem::take(&mut self.offsets).into()), }, - quote! { self.offsets.shrink_to_fit(); }, - ) - } else { - (quote! {}, quote! {}, quote! {}, quote! {None}, quote! 
{}) - }; + let ( + offsets_decl, + offsets_init, + offsets_reserve, + offsets_take, + offsets_clone, + offsets_shrink_to_fit, + ) = if is_dense { + ( + quote! { offsets: Vec, }, + quote! { offsets: vec![], }, + quote! { self.offsets.reserve(additional); }, + quote! { Some(arrow::buffer::Buffer::from_vec(std::mem::take(&mut self.offsets))) }, + quote! { Some(arrow::buffer::Buffer::from_slice_ref(&self.offsets)) }, + quote! { self.offsets.shrink_to_fit(); }, + ) + } else { + ( + quote! {}, + quote! {}, + quote! {}, + quote! {None}, + quote! {None}, + quote! {}, + ) + }; let try_push_match_blocks = variants .iter() @@ -153,13 +175,13 @@ pub fn expand_serialize(input: DeriveEnum) -> TokenStream { // There might be a better way of doing this. if is_dense { let update_offset = quote! { - self.types.push(#lit_idx); + self.type_ids.push(#lit_idx); self.offsets.push((self.#name.len() - 1) as i32); }; if v.is_unit { quote! { #original_name::#name => { - <#variant_type as arrow2_convert::serialize::ArrowSerialize>::arrow_serialize(&true, &mut self.#name)?; + <#variant_type as ArrowSerialize>::arrow_serialize(&true, &mut self.#name)?; #update_offset } } @@ -167,7 +189,7 @@ pub fn expand_serialize(input: DeriveEnum) -> TokenStream { else { quote! { #original_name::#name(v) => { - <#variant_type as arrow2_convert::serialize::ArrowSerialize>::arrow_serialize(v, &mut self.#name)?; + <#variant_type as ArrowSerialize>::arrow_serialize(v, &mut self.#name)?; #update_offset } } @@ -182,7 +204,7 @@ pub fn expand_serialize(input: DeriveEnum) -> TokenStream { let name = &y.syn.ident; if nested_idx != idx { quote! { - <<#variant_type as arrow2_convert::serialize::ArrowSerialize>::MutableArrayType as MutableArray>::push_null(&mut self.#name); + <<#variant_type as ArrowSerialize>::ArrayBuilderType as PushNull>::push_null(&mut self.#name); } } else { @@ -192,13 +214,13 @@ pub fn expand_serialize(input: DeriveEnum) -> TokenStream { .collect::>(); let update_offset = quote! 
{ - self.types.push(#lit_idx); + self.type_ids.push(#lit_idx); }; if v.is_unit { quote! { #original_name::#name => { - <#variant_type as arrow2_convert::serialize::ArrowSerialize>::arrow_serialize(&true, &mut self.#name)?; + <#variant_type as ArrowSerialize>::arrow_serialize(&true, &mut self.#name)?; #( #push_none )* @@ -209,7 +231,7 @@ pub fn expand_serialize(input: DeriveEnum) -> TokenStream { else { quote! { #original_name::#name(v) => { - <#variant_type as arrow2_convert::serialize::ArrowSerialize>::arrow_serialize(v, &mut self.#name)?; + <#variant_type as ArrowSerialize>::arrow_serialize(v, &mut self.#name)?; #( #push_none )* @@ -221,22 +243,10 @@ pub fn expand_serialize(input: DeriveEnum) -> TokenStream { }) .collect::>(); - let try_push_none = if is_dense { - let first_array_type = &mutable_variant_array_types[0]; - let first_name = &variant_names[0]; - quote! { - self.types.push(0); - self.offsets.push((self.#first_name.len()) as i32); - <#first_array_type as MutableArray>::push_null(&mut self.#first_name); - } - } else { - quote! { - self.types.push(0); - #( - <#mutable_variant_array_types as MutableArray>::push_null(&mut self.#variant_names); - )* - } - }; + let num_variants = syn::LitInt::new( + &format!("{}", variant_types.len()), + proc_macro2::Span::call_site(), + ); let array_decl = quote! { #[allow(non_snake_case)] @@ -245,30 +255,55 @@ pub fn expand_serialize(input: DeriveEnum) -> TokenStream { #( #variant_names: #mutable_variant_array_types, )* - data_type: arrow2::datatypes::DataType, - types: Vec, + data_type: arrow::datatypes::DataType, + type_ids: Vec, + field_type_ids: [i8; #num_variants], #offsets_decl } }; + let field_type_ids_value = variant_indices + .iter() + .map(|idx| quote! {#idx}) + .collect::>(); + let array_impl = quote! 
{ impl #mutable_array_name { pub fn new() -> Self { Self { - #(#variant_names: <#variant_types as arrow2_convert::serialize::ArrowSerialize>::new_array(),)* - data_type: <#original_name as arrow2_convert::field::ArrowField>::data_type(), - types: vec![], + #(#variant_names: <#variant_types as arrow_convert::serialize::ArrowSerialize>::new_array(),)* + data_type: <#original_name as arrow_convert::field::ArrowField>::data_type(), + type_ids: vec![], + field_type_ids: [ #( #field_type_ids_value ),* ], #offsets_init } } - } - }; - let array_try_push_impl = quote! { - impl<__T: std::borrow::Borrow<#original_name>> arrow2::array::TryPush> for #mutable_array_name { - fn try_push(&mut self, item: Option<__T>) -> arrow2::error::Result<()> { - use arrow2::array::MutableArray; + fn data_type(&self) -> &arrow::datatypes::DataType { + &self.data_type + } + + fn append_null(&mut self) { + use arrow_convert::serialize::PushNull; + self.try_push(None::<#original_name>).unwrap(); + } + fn validity(&self) -> Option<&arrow::array::BooleanBufferBuilder> { + None + } + + fn shrink_to_fit(&mut self) { + self.type_ids.shrink_to_fit(); + #offsets_shrink_to_fit + } + + fn reserve(&mut self, additional: usize) { + self.type_ids.reserve(additional); + #offsets_reserve + } + + fn try_push(&mut self, item: Option>) -> arrow::error::Result<()> { + use arrow_convert::serialize::{ArrowSerialize, PushNull}; match item { Some(i) => { match i.borrow() { @@ -278,11 +313,46 @@ pub fn expand_serialize(input: DeriveEnum) -> TokenStream { } }, None => { - #try_push_none + ::push_null(self); } } Ok(()) } + + // fn try_extend>>(&mut self, iter: impl arrow_convert::deserialize::IntoArrowArrayIterator>>) -> arrow::error::Result<()> { + fn try_extend(&mut self, iter: impl IntoIterator>>) -> arrow::error::Result<()> { + use arrow_convert::serialize::PushNull; + for i in iter { + self.try_push(i)?; + } + Ok(()) + } + } + }; + + let push_null_impl = if is_dense { + let first_array_type = 
&mutable_variant_array_types[0]; + let first_name = &variant_names[0]; + quote! { + self.type_ids.push(0); + self.offsets.push((self.#first_name.len()) as i32); + <#first_array_type as PushNull>::push_null(&mut self.#first_name); + } + } else { + quote! { + self.type_ids.push(0); + #( + <#mutable_variant_array_types as PushNull>::push_null(&mut self.#variant_names); + )* + } + }; + + let array_push_null_impl = quote! { + impl arrow_convert::serialize::PushNull for #mutable_array_name { + fn push_null(&mut self) { + use arrow_convert::serialize::PushNull; + #push_null_impl + } } }; @@ -294,99 +364,90 @@ pub fn expand_serialize(input: DeriveEnum) -> TokenStream { } }; - let array_try_extend_impl = quote! { - impl<__T: std::borrow::Borrow<#original_name>> arrow2::array::TryExtend> for #mutable_array_name { - fn try_extend>>(&mut self, iter: I) -> arrow2::error::Result<()> { - use arrow2::array::TryPush; - for i in iter { - self.try_push(i)?; - } - Ok(()) - } - } - }; - let array_mutable_array_impl = quote! 
{ - impl arrow2::array::MutableArray for #mutable_array_name { - fn data_type(&self) -> &arrow2::datatypes::DataType { - &self.data_type - } - + impl arrow::array::ArrayBuilder for #mutable_array_name { fn len(&self) -> usize { - self.types.len() + self.type_ids.len() } - fn validity(&self) -> Option<&arrow2::bitmap::MutableBitmap> { - None - } + fn finish(&mut self) -> arrow::array::ArrayRef { + let arrow::datatypes::DataType::Union(union_fields, _) = + <#original_name as arrow_convert::field::ArrowField>::data_type() + .clone() else { + panic!("datatype is not a union") + }; - fn as_box(&mut self) -> Box { - let values = vec![#( - <#mutable_variant_array_types as arrow2::array::MutableArray>::as_box(&mut self.#variant_names), + let values = [#( + <#mutable_variant_array_types as arrow::array::ArrayBuilder>::finish(&mut self.#variant_names), )*]; - Box::new(arrow2::array::UnionArray::new( - <#original_name as arrow2_convert::field::ArrowField>::data_type().clone(), - std::mem::take(&mut self.types).into(), - values, - #offsets_take - )) + let child_arrays = union_fields.iter() + .map(|(_, field)| field.as_ref().to_owned()) + .zip(values.into_iter()) + .collect::>(); + + let type_ids = arrow::buffer::Buffer::from_vec(std::mem::take(&mut self.type_ids)); + + std::sync::Arc::new(arrow::array::UnionArray::try_new( + &self.field_type_ids, + type_ids, + #offsets_take, + child_arrays + ).unwrap()) } - fn as_arc(&mut self) -> std::sync::Arc { - let values = vec![#( - <#mutable_variant_array_types as arrow2::array::MutableArray>::as_box(&mut self.#variant_names), + fn finish_cloned(&self) -> arrow::array::ArrayRef { + let arrow::datatypes::DataType::Union(union_fields, _) = + <#original_name as arrow_convert::field::ArrowField>::data_type() + .clone() else { + panic!("datatype is not a union") + }; + + let values = [#( + <#mutable_variant_array_types as arrow::array::ArrayBuilder>::finish_cloned(&self.#variant_names), )*]; - 
std::sync::Arc::new(arrow2::array::UnionArray::new( - <#original_name as arrow2_convert::field::ArrowField>::data_type().clone(), - std::mem::take(&mut self.types).into(), - values, - #offsets_take - )) + let child_arrays = union_fields.iter() + .map(|(_, field)| field.as_ref().to_owned()) + .zip(values.into_iter()) + .collect::>(); + + let type_ids = arrow::buffer::Buffer::from_slice_ref(&self.type_ids); + + std::sync::Arc::new(arrow::array::UnionArray::try_new( + &self.field_type_ids, + type_ids, + #offsets_clone, + child_arrays + ).unwrap()) } fn as_any(&self) -> &dyn std::any::Any { self } - fn as_mut_any(&mut self) -> &mut dyn std::any::Any { + fn as_any_mut(&mut self) -> &mut dyn std::any::Any { self } - fn push_null(&mut self) { - use arrow2::array::TryPush; - self.try_push(None::<#original_name>).unwrap(); - } - - fn shrink_to_fit(&mut self) { - #( - <#mutable_variant_array_types as arrow2::array::MutableArray>::shrink_to_fit(&mut self.#variant_names); - )* - self.types.shrink_to_fit(); - #offsets_shrink_to_fit + fn into_box_any(self: Box) -> Box { + self } - fn reserve(&mut self, additional: usize) { - #(<<#variant_types as arrow2_convert::serialize::ArrowSerialize>::MutableArrayType as arrow2::array::MutableArray>::reserve(&mut self.#variant_names, additional);)* - self.types.reserve(additional); - #offsets_reserve - } } }; let field_arrow_serialize_impl = quote! 
{ - impl arrow2_convert::serialize::ArrowSerialize for #original_name { - type MutableArrayType = #mutable_array_name; + impl arrow_convert::serialize::ArrowSerialize for #original_name { + type ArrayBuilderType = #mutable_array_name; #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::default() + fn new_array() -> Self::ArrayBuilderType { + Self::ArrayBuilderType::default() } #[inline] - fn arrow_serialize(v: &Self, array: &mut Self::MutableArrayType) -> arrow2::error::Result<()> { - use arrow2::array::TryPush; + fn arrow_serialize(v: &Self, array: &mut Self::ArrayBuilderType) -> arrow::error::Result<()> { array.try_push(Some(v)) } } @@ -395,9 +456,8 @@ pub fn expand_serialize(input: DeriveEnum) -> TokenStream { TokenStream::from_iter([ array_decl, array_impl, - array_try_push_impl, + array_push_null_impl, array_default_impl, - array_try_extend_impl, array_mutable_array_impl, field_arrow_serialize_impl, ]) @@ -435,11 +495,11 @@ pub fn expand_deserialize(input: DeriveEnum) -> TokenStream { else { quote! { #lit_idx => { - let mut slice_iter = <<#variant_type as arrow2_convert::deserialize::ArrowDeserialize> ::ArrayType as arrow2_convert::deserialize::ArrowArray> ::iter_from_array_ref(slice.deref()); + let mut slice_iter = <<#variant_type as arrow_convert::deserialize::ArrowDeserialize> ::ArrayType as arrow_convert::deserialize::ArrowArray> ::iter_from_array_ref(slice.deref()); let v = slice_iter .next() .unwrap_or_else(|| panic!("Invalid offset for {}", #lit_idx)); - Some(<#variant_type as arrow2_convert::deserialize::ArrowDeserialize>::arrow_deserialize(v).map(|v| #original_name::#name(v))) + Some(<#variant_type as arrow_convert::deserialize::ArrowDeserialize>::arrow_deserialize(v).map(|v| #original_name::#name(v))) } } } @@ -454,25 +514,25 @@ pub fn expand_deserialize(input: DeriveEnum) -> TokenStream { }; let array_impl = quote! 
{ - impl arrow2_convert::deserialize::ArrowArray for #array_name + impl arrow_convert::deserialize::ArrowArray for #array_name { - type BaseArrayType = arrow2::array::UnionArray; + type BaseArrayType = arrow::array::UnionArray; #[inline] - fn iter_from_array_ref<'a>(b: &'a dyn arrow2::array::Array) -> <&'a Self as IntoIterator>::IntoIter + fn iter_from_array_ref<'a>(b: &'a dyn arrow::array::Array) -> <&'a Self as arrow_convert::deserialize::IntoArrowArrayIterator>::IntoIter { - let arr = b.as_any().downcast_ref::().unwrap(); + let arr = b.as_any().downcast_ref::().unwrap(); #iterator_name { arr, - index_iter: 0..arr.len(), + index_iter: 0..arrow::array::Array::len(&arr), } } } }; let array_into_iterator_impl = quote! { - impl<'a> IntoIterator for &'a #array_name + impl<'a> arrow_convert::deserialize::IntoArrowArrayIterator for &'a #array_name { type Item = Option<#original_name>; type IntoIter = #iterator_name<'a>; @@ -486,7 +546,7 @@ pub fn expand_deserialize(input: DeriveEnum) -> TokenStream { let array_iterator_decl = quote! { #[allow(non_snake_case)] #visibility struct #iterator_name<'a> { - arr: &'a arrow2::array::UnionArray, + arr: &'a arrow::array::UnionArray, index_iter: std::ops::Range, } }; @@ -498,11 +558,10 @@ pub fn expand_deserialize(input: DeriveEnum) -> TokenStream { #[inline] fn next(&mut self) -> Option { use core::ops::Deref; - let Some(next_index) = self.index_iter.next() else { - return None; - }; - let (type_idx, offset) = self.arr.index(next_index); - let slice = self.arr.fields()[type_idx].sliced(offset, 1); + let next_index = self.index_iter.next()?; + let type_idx = self.arr.type_id(next_index); + let offset = self.arr.value_offset(next_index); + let slice = self.arr.child(type_idx).slice(offset, 1); match type_idx { #iter_next_match_block _ => panic!("Invalid type for {}", #original_name_str) @@ -512,7 +571,7 @@ pub fn expand_deserialize(input: DeriveEnum) -> TokenStream { }; let field_arrow_deserialize_impl = quote! 
{ - impl arrow2_convert::deserialize::ArrowDeserialize for #original_name { + impl arrow_convert::deserialize::ArrowDeserialize for #original_name { type ArrayType = #array_name; #[inline] diff --git a/arrow2_convert_derive/src/derive_struct.rs b/arrow_convert_derive/src/derive_struct.rs similarity index 61% rename from arrow2_convert_derive/src/derive_struct.rs rename to arrow_convert_derive/src/derive_struct.rs index a7b73f7..82fd78d 100644 --- a/arrow2_convert_derive/src/derive_struct.rs +++ b/arrow_convert_derive/src/derive_struct.rs @@ -108,31 +108,34 @@ pub fn expand_field(input: DeriveStruct) -> TokenStream { let field = &input.fields[0]; let ty = &field.field_type; quote! ( - <#ty as arrow2_convert::field::ArrowField>::data_type() + <#ty as arrow_convert::field::ArrowField>::data_type() ) } else { let field_names = field_members.iter().map(|field| match field { syn::Member::Named(ident) => format_ident!("{}", ident), syn::Member::Unnamed(index) => format_ident!("field_{}", index), }); - quote!(arrow2::datatypes::DataType::Struct(vec![ - #( - <#field_types as arrow2_convert::field::ArrowField>::field(stringify!(#field_names)), - )* - ])) + quote!(arrow::datatypes::DataType::Struct( + arrow::datatypes::Fields::from(vec![ + #( + <#field_types as arrow_convert::field::ArrowField>::field(stringify!(#field_names)), + )* + ]) + )) } }; quote!( - impl arrow2_convert::field::ArrowField for #original_name { + impl arrow_convert::field::ArrowField for #original_name { type Type = Self; + type Native = Self; - fn data_type() -> arrow2::datatypes::DataType { + fn data_type() -> arrow::datatypes::DataType { #data_type_impl } } - arrow2_convert::arrow_enable_vec_for_type!(#original_name); + arrow_convert::arrow_enable_vec_for_type!(#original_name); ) } @@ -151,7 +154,7 @@ pub fn expand_serialize(input: DeriveStruct) -> TokenStream { let mutable_array_name = &input.common.mutable_array_name(); let mutable_field_array_types = field_types .iter() - .map(|field_type| 
quote_spanned!( field_type.span() => <#field_type as arrow2_convert::serialize::ArrowSerialize>::MutableArrayType)) + .map(|field_type| quote_spanned!( field_type.span() => <#field_type as arrow_convert::serialize::ArrowSerialize>::ArrayBuilderType)) .collect::>(); let array_decl = quote! { @@ -160,8 +163,8 @@ pub fn expand_serialize(input: DeriveStruct) -> TokenStream { #( #field_idents: #mutable_field_array_types, )* - data_type: arrow2::datatypes::DataType, - validity: Option, + data_type: arrow::datatypes::DataType, + validity: Option, } }; @@ -169,67 +172,55 @@ pub fn expand_serialize(input: DeriveStruct) -> TokenStream { impl #mutable_array_name { pub fn new() -> Self { Self { - #(#field_idents: <#field_types as arrow2_convert::serialize::ArrowSerialize>::new_array(),)* - data_type: <#original_name as arrow2_convert::field::ArrowField>::data_type(), + #(#field_idents: <#field_types as arrow_convert::serialize::ArrowSerialize>::new_array(),)* + data_type: <#original_name as arrow_convert::field::ArrowField>::data_type(), validity: None, } } fn init_validity(&mut self) { - let mut validity = arrow2::bitmap::MutableBitmap::new(); - validity.extend_constant(::len(self), true); - validity.set(::len(self) - 1, false); - self.validity = Some(validity) + let length = ::len(self); + let mut validity = arrow::array::BooleanBufferBuilder::new(length); + validity.append_n(length - 1, true); + validity.append(false); + self.validity = Some(validity); } - } - }; - let array_default_impl = quote! { - impl Default for #mutable_array_name { - fn default() -> Self { - Self::new() + fn data_type(&self) -> &arrow::datatypes::DataType { + &self.data_type + } + + fn append_null(&mut self) { + self.try_push(None::<&#original_name>).unwrap(); + } + + fn validity(&self) -> Option<&arrow::array::BooleanBufferBuilder> { + self.validity.as_ref() } - } - }; - let array_try_push_impl = quote! 
{ - impl<__T: std::borrow::Borrow<#original_name>> arrow2::array::TryPush> for #mutable_array_name { - fn try_push(&mut self, item: Option<__T>) -> arrow2::error::Result<()> { - use arrow2::array::MutableArray; + fn try_push(&mut self, item: Option>) -> arrow::error::Result<()> { + use arrow::array::ArrayBuilder; use std::borrow::Borrow; match item { Some(i) => { let i = i.borrow(); #( - <#field_types as arrow2_convert::serialize::ArrowSerialize>::arrow_serialize(i.#field_names.borrow(), &mut self.#field_idents)?; + <#field_types as arrow_convert::serialize::ArrowSerialize>::arrow_serialize(i.#field_names.borrow(), &mut self.#field_idents)?; )*; match &mut self.validity { - Some(validity) => validity.push(true), + Some(validity) => validity.append(true), None => {} } }, None => { - #( - <#mutable_field_array_types as MutableArray>::push_null(&mut self.#field_idents); - )*; - match &mut self.validity { - Some(validity) => validity.push(false), - None => { - self.init_validity(); - } - } + ::push_null(self); } } Ok(()) } - } - }; - let array_try_extend_impl = quote! { - impl<__T: std::borrow::Borrow<#original_name>> arrow2::array::TryExtend> for #mutable_array_name { - fn try_extend>>(&mut self, iter: I) -> arrow2::error::Result<()> { - use arrow2::array::TryPush; + fn try_extend<'a, I: IntoIterator>>(&mut self, iter: I) -> arrow::error::Result<()> { for i in iter { self.try_push(i)?; } @@ -238,43 +229,77 @@ pub fn expand_serialize(input: DeriveStruct) -> TokenStream { } }; - let first_ident = &field_idents[0]; + let array_default_impl = quote! { + impl Default for #mutable_array_name { + fn default() -> Self { + Self::new() + } + } + }; - let array_mutable_array_impl = quote! { - impl arrow2::array::MutableArray for #mutable_array_name { - fn data_type(&self) -> &arrow2::datatypes::DataType { - &self.data_type + let array_push_null_impl = quote! 
{ + impl arrow_convert::serialize::PushNull for #mutable_array_name { + fn push_null(&mut self) { + use arrow::array::ArrayBuilder; + use arrow_convert::serialize::{ArrowSerialize, PushNull}; + use std::borrow::Borrow; + + #( + // #mutable_field_array_types::append_null(&mut self.#field_idents); + <<#field_types as ArrowSerialize>::ArrayBuilderType as PushNull>::push_null(&mut self.#field_idents); + // self.#field_idents.append_null(); + )*; + match &mut self.validity { + Some(validity) => validity.append(false), + None => { + self.init_validity(); + } + } } + } + }; + let first_ident = &field_idents[0]; + + let array_mutable_array_impl = quote! { + impl arrow::array::ArrayBuilder for #mutable_array_name { fn len(&self) -> usize { self.#first_ident.len() } - fn validity(&self) -> Option<&arrow2::bitmap::MutableBitmap> { - self.validity.as_ref() - } - - fn as_box(&mut self) -> Box { + fn finish(&mut self) -> arrow::array::ArrayRef { let values = vec![#( - <#mutable_field_array_types as arrow2::array::MutableArray>::as_box(&mut self.#field_idents), + <#mutable_field_array_types as arrow::array::ArrayBuilder>::finish(&mut self.#field_idents), )*]; - Box::new(arrow2::array::StructArray::new( - <#original_name as arrow2_convert::field::ArrowField>::data_type().clone(), + let arrow::datatypes::DataType::Struct(fields) = + <#original_name as arrow_convert::field::ArrowField>::data_type() + .clone() else { + panic!("datatype is not struct") + }; + + std::sync::Arc::new(arrow::array::StructArray::new( + fields, values, - std::mem::take(&mut self.validity).map(|x| x.into()), + std::mem::take(&mut self.validity).map(|mut x| x.finish().into()), )) } - fn as_arc(&mut self) -> std::sync::Arc { + fn finish_cloned(&self) -> arrow::array::ArrayRef { let values = vec![#( - <#mutable_field_array_types as arrow2::array::MutableArray>::as_box(&mut self.#field_idents), + <#mutable_field_array_types as arrow::array::ArrayBuilder>::finish_cloned(&self.#field_idents), )*]; - 
std::sync::Arc::new(arrow2::array::StructArray::new( - <#original_name as arrow2_convert::field::ArrowField>::data_type().clone(), + let arrow::datatypes::DataType::Struct(fields) = + <#original_name as arrow_convert::field::ArrowField>::data_type() + .clone() else { + panic!("datatype is not struct") + }; + + std::sync::Arc::new(arrow::array::StructArray::new( + fields, values, - std::mem::take(&mut self.validity).map(|x| x.into()) + self.validity.as_ref().map(|x| x.finish_cloned().into()) )) } @@ -282,29 +307,12 @@ pub fn expand_serialize(input: DeriveStruct) -> TokenStream { self } - fn as_mut_any(&mut self) -> &mut dyn std::any::Any { + fn as_any_mut(&mut self) -> &mut dyn std::any::Any { self } - fn push_null(&mut self) { - use arrow2::array::TryPush; - self.try_push(None::<#original_name>).unwrap(); - } - - fn shrink_to_fit(&mut self) { - #( - <#mutable_field_array_types as arrow2::array::MutableArray>::shrink_to_fit(&mut self.#field_idents); - )* - if let Some(validity) = &mut self.validity { - validity.shrink_to_fit(); - } - } - - fn reserve(&mut self, additional: usize) { - if let Some(x) = self.validity.as_mut() { - x.reserve(additional) - } - #(<<#field_types as arrow2_convert::serialize::ArrowSerialize>::MutableArrayType as arrow2::array::MutableArray>::reserve(&mut self.#field_idents, additional);)* + fn into_box_any(self: Box) -> Box { + self } } }; @@ -314,33 +322,32 @@ pub fn expand_serialize(input: DeriveStruct) -> TokenStream { let first_type = &field_types[0]; // Everything delegates to first field. quote! 
{ - impl arrow2_convert::serialize::ArrowSerialize for #original_name { - type MutableArrayType = <#first_type as arrow2_convert::serialize::ArrowSerialize>::MutableArrayType; + impl arrow_convert::serialize::ArrowSerialize for #original_name { + type ArrayBuilderType = <#first_type as arrow_convert::serialize::ArrowSerialize>::ArrayBuilderType; #[inline] - fn new_array() -> Self::MutableArrayType { - <#first_type as arrow2_convert::serialize::ArrowSerialize>::new_array() + fn new_array() -> Self::ArrayBuilderType { + <#first_type as arrow_convert::serialize::ArrowSerialize>::new_array() } #[inline] - fn arrow_serialize(v: &Self, array: &mut Self::MutableArrayType) -> arrow2::error::Result<()> { - <#first_type as arrow2_convert::serialize::ArrowSerialize>::arrow_serialize(&v.#first_field, array) + fn arrow_serialize(v: &Self, array: &mut Self::ArrayBuilderType) -> arrow::error::Result<()> { + <#first_type as arrow_convert::serialize::ArrowSerialize>::arrow_serialize(&v.#first_field, array) } } } } else { let field_arrow_serialize_impl = quote! 
{ - impl arrow2_convert::serialize::ArrowSerialize for #original_name { - type MutableArrayType = #mutable_array_name; + impl arrow_convert::serialize::ArrowSerialize for #original_name { + type ArrayBuilderType = #mutable_array_name; #[inline] - fn new_array() -> Self::MutableArrayType { - Self::MutableArrayType::default() + fn new_array() -> Self::ArrayBuilderType { + Self::ArrayBuilderType::default() } #[inline] - fn arrow_serialize(v: &Self, array: &mut Self::MutableArrayType) -> arrow2::error::Result<()> { - use arrow2::array::TryPush; + fn arrow_serialize(v: &Self, array: &mut Self::ArrayBuilderType) -> arrow::error::Result<()> { array.try_push(Some(v)) } } @@ -349,8 +356,7 @@ pub fn expand_serialize(input: DeriveStruct) -> TokenStream { array_decl, array_impl, array_default_impl, - array_try_push_impl, - array_try_extend_impl, + array_push_null_impl, array_mutable_array_impl, field_arrow_serialize_impl, ]) @@ -379,31 +385,33 @@ pub fn expand_deserialize(input: DeriveStruct) -> TokenStream { }; let array_impl = quote! 
{ - impl arrow2_convert::deserialize::ArrowArray for #array_name + impl arrow_convert::deserialize::ArrowArray for #array_name { - type BaseArrayType = arrow2::array::StructArray; + type BaseArrayType = arrow::array::StructArray; #[inline] - fn iter_from_array_ref<'a>(b: &'a dyn arrow2::array::Array) -> <&'a Self as IntoIterator>::IntoIter + fn iter_from_array_ref<'a>(b: &'a dyn arrow::array::Array) -> <&'a Self as arrow_convert::deserialize::IntoArrowArrayIterator>::IntoIter { use core::ops::Deref; - let arr = b.as_any().downcast_ref::().unwrap(); - let values = arr.values(); - let validity = arr.validity(); + use arrow::array::Array; + + let arr = b.as_any().downcast_ref::().unwrap(); + let values = arr.columns(); + let validity = arr.nulls(); // for now do a straight comp #iterator_name { #( - #field_idents: <<#field_types as arrow2_convert::deserialize::ArrowDeserialize>::ArrayType as arrow2_convert::deserialize::ArrowArray>::iter_from_array_ref(values[#field_indices].deref()), + #field_idents: <<#field_types as arrow_convert::deserialize::ArrowDeserialize>::ArrayType as arrow_convert::deserialize::ArrowArray>::iter_from_array_ref(values[#field_indices].deref()), )* has_validity: validity.as_ref().is_some(), - validity_iter: validity.as_ref().map(|x| x.iter()).unwrap_or_else(|| arrow2::bitmap::utils::BitmapIter::new(&[], 0, 0)) + validity_iter: validity.as_ref().map(|x| x.iter()).unwrap_or_else(|| arrow::util::bit_iterator::BitIterator::new(&[], 0, 0)) } } } }; let array_into_iterator_impl = quote! { - impl<'a> IntoIterator for &'a #array_name + impl<'a> arrow_convert::deserialize::IntoArrowArrayIterator for &'a #array_name { type Item = Option<#original_name>; type IntoIter = #iterator_name<'a>; @@ -417,9 +425,9 @@ pub fn expand_deserialize(input: DeriveStruct) -> TokenStream { let iterator_decl = quote! 
{ #visibility struct #iterator_name<'a> { #( - #field_idents: <&'a <#field_types as arrow2_convert::deserialize::ArrowDeserialize>::ArrayType as IntoIterator>::IntoIter, + #field_idents: <&'a <#field_types as arrow_convert::deserialize::ArrowDeserialize>::ArrayType as arrow_convert::deserialize::IntoArrowArrayIterator>::IntoIter, )* - validity_iter: arrow2::bitmap::utils::BitmapIter<'a>, + validity_iter: arrow::util::bit_iterator::BitIterator<'a>, has_validity: bool } }; @@ -428,13 +436,13 @@ pub fn expand_deserialize(input: DeriveStruct) -> TokenStream { // If the fields are unnamed, we create a tuple-struct syn::parse_quote! { #original_name ( - #(<#field_types as arrow2_convert::deserialize::ArrowDeserialize>::arrow_deserialize_internal(#field_idents),)* + #(<#field_types as arrow_convert::deserialize::ArrowDeserialize>::arrow_deserialize_internal(#field_idents),)* ) } } else { syn::parse_quote! { #original_name { - #(#field_names: <#field_types as arrow2_convert::deserialize::ArrowDeserialize>::arrow_deserialize_internal(#field_idents),)* + #(#field_names: <#field_types as arrow_convert::deserialize::ArrowDeserialize>::arrow_deserialize_internal(#field_idents),)* #(#skipped_field_names: std::default::Default::default(),)* } } @@ -490,18 +498,18 @@ pub fn expand_deserialize(input: DeriveStruct) -> TokenStream { // Everything delegates to first field. quote! 
{ - impl arrow2_convert::deserialize::ArrowDeserialize for #original_name { - type ArrayType = <#first_type as arrow2_convert::deserialize::ArrowDeserialize>::ArrayType; + impl arrow_convert::deserialize::ArrowDeserialize for #original_name { + type ArrayType = <#first_type as arrow_convert::deserialize::ArrowDeserialize>::ArrayType; #[inline] - fn arrow_deserialize<'a>(v: <&Self::ArrayType as IntoIterator>::Item) -> Option { - <#first_type as arrow2_convert::deserialize::ArrowDeserialize>::arrow_deserialize(v).map(#deser_body_mapper) + fn arrow_deserialize<'a>(v: <&Self::ArrayType as arrow_convert::deserialize::IntoArrowArrayIterator>::Item) -> Option { + <#first_type as arrow_convert::deserialize::ArrowDeserialize>::arrow_deserialize(v).map(#deser_body_mapper) } } } } else { let field_arrow_deserialize_impl = quote! { - impl arrow2_convert::deserialize::ArrowDeserialize for #original_name { + impl arrow_convert::deserialize::ArrowDeserialize for #original_name { type ArrayType = #array_name; #[inline] diff --git a/arrow2_convert_derive/src/input.rs b/arrow_convert_derive/src/input.rs similarity index 100% rename from arrow2_convert_derive/src/input.rs rename to arrow_convert_derive/src/input.rs diff --git a/arrow2_convert_derive/src/lib.rs b/arrow_convert_derive/src/lib.rs similarity index 86% rename from arrow2_convert_derive/src/lib.rs rename to arrow_convert_derive/src/lib.rs index 21c573b..f431c45 100644 --- a/arrow2_convert_derive/src/lib.rs +++ b/arrow_convert_derive/src/lib.rs @@ -9,7 +9,7 @@ use input::*; /// Derive macro for arrow fields #[proc_macro_error] #[proc_macro_derive(ArrowField, attributes(arrow_field))] -pub fn arrow2_convert_derive_field(input: proc_macro::TokenStream) -> proc_macro::TokenStream { +pub fn arrow_convert_derive_field(input: proc_macro::TokenStream) -> proc_macro::TokenStream { let ast: syn::DeriveInput = syn::parse(input).unwrap(); match &ast.data { @@ -24,7 +24,7 @@ pub fn arrow2_convert_derive_field(input: 
proc_macro::TokenStream) -> proc_macro /// Derive macro for arrow serialize #[proc_macro_error] #[proc_macro_derive(ArrowSerialize, attributes(arrow_field))] -pub fn arrow2_convert_derive_serialize(input: proc_macro::TokenStream) -> proc_macro::TokenStream { +pub fn arrow_convert_derive_serialize(input: proc_macro::TokenStream) -> proc_macro::TokenStream { let ast: syn::DeriveInput = syn::parse(input).unwrap(); match &ast.data { @@ -41,9 +41,7 @@ pub fn arrow2_convert_derive_serialize(input: proc_macro::TokenStream) -> proc_m /// Derive macro for arrow deserialize #[proc_macro_error] #[proc_macro_derive(ArrowDeserialize, attributes(arrow_field))] -pub fn arrow2_convert_derive_deserialize( - input: proc_macro::TokenStream, -) -> proc_macro::TokenStream { +pub fn arrow_convert_derive_deserialize(input: proc_macro::TokenStream) -> proc_macro::TokenStream { let ast: syn::DeriveInput = syn::parse(input).unwrap(); match &ast.data { diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index d1bf697..e745ce3 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,10 +1,10 @@ [package] -name = "arrow2_convert_example_simple" -version = "0.1.0" +name = "arrow_convert_example_simple" +version = "0.2.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -arrow2 = "0.17" -arrow2_convert = { version = "0.5", path = "../../arrow2_convert" } +arrow = "51.0" +arrow_convert = { version = "0.6", path = "../../arrow_convert" } diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs index ea1b8ba..4b3dc52 100644 --- a/examples/simple/src/main.rs +++ b/examples/simple/src/main.rs @@ -1,6 +1,6 @@ /// Simple example -use arrow2::array::Array; -use arrow2_convert::{ +use arrow::array::{Array, ArrayRef}; +use arrow_convert::{ deserialize::TryIntoCollection, serialize::TryIntoArrow, ArrowDeserialize, ArrowField, ArrowSerialize, }; @@ -25,13 +25,13 @@ fn main() { 
]; // serialize to an arrow array. try_into_arrow() is enabled by the TryIntoArrow trait - let arrow_array: Box = original_array.try_into_arrow().unwrap(); + let arrow_array: ArrayRef = original_array.try_into_arrow().unwrap(); // which can be cast to an Arrow StructArray and be used for all kinds of IPC, FFI, etc. - // supported by `arrow2` + // supported by `arrow` let struct_array = arrow_array .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); assert_eq!(struct_array.len(), 3); diff --git a/rust-toolchain b/rust-toolchain index 4934985..79e15fd 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1 +1 @@ -1.69.0 +1.77.0