-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Support reverse for ListView #18424
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support reverse for ListView #18424
Changes from 2 commits
0175167
48912d9
b16ce4d
714f9e7
e72469f
71aefa9
7d39f69
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,14 +19,18 @@ | |
|
|
||
| use crate::utils::make_scalar_function; | ||
| use arrow::array::{ | ||
| Array, ArrayRef, Capacities, FixedSizeListArray, GenericListArray, MutableArrayData, | ||
| OffsetSizeTrait, | ||
| Array, ArrayRef, Capacities, FixedSizeListArray, GenericListArray, | ||
| GenericListViewArray, MutableArrayData, OffsetSizeTrait, UInt32Array, | ||
| }; | ||
| use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; | ||
| use arrow::compute::take; | ||
| use arrow::datatypes::DataType::{ | ||
| FixedSizeList, LargeList, LargeListView, List, ListView, Null, | ||
| }; | ||
| use arrow::buffer::OffsetBuffer; | ||
| use arrow::datatypes::DataType::{FixedSizeList, LargeList, List, Null}; | ||
| use arrow::datatypes::{DataType, FieldRef}; | ||
| use datafusion_common::cast::{ | ||
| as_fixed_size_list_array, as_large_list_array, as_list_array, | ||
| as_fixed_size_list_array, as_large_list_array, as_large_list_view_array, | ||
| as_list_array, as_list_view_array, | ||
| }; | ||
| use datafusion_common::{exec_err, utils::take_function_args, Result}; | ||
| use datafusion_expr::{ | ||
|
|
@@ -133,6 +137,14 @@ pub fn array_reverse_inner(arg: &[ArrayRef]) -> Result<ArrayRef> { | |
| fixed_size_array_reverse(array, field) | ||
| } | ||
| Null => Ok(Arc::clone(input_array)), | ||
| ListView(field) => { | ||
| let array = as_list_view_array(input_array)?; | ||
| list_view_reverse::<i32>(array, field) | ||
| } | ||
| LargeListView(field) => { | ||
| let array = as_large_list_view_array(input_array)?; | ||
| list_view_reverse::<i64>(array, field) | ||
| } | ||
| array_type => exec_err!("array_reverse does not support type '{array_type}'."), | ||
| } | ||
| } | ||
|
|
@@ -183,6 +195,75 @@ fn general_array_reverse<O: OffsetSizeTrait + TryFrom<i64>>( | |
| )?)) | ||
| } | ||
|
|
||
| fn list_view_reverse<O: OffsetSizeTrait + TryFrom<i64>>( | ||
vegarsti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| array: &GenericListViewArray<O>, | ||
| field: &FieldRef, | ||
| ) -> Result<ArrayRef> { | ||
| let (_, offsets, sizes, values, nulls) = array.clone().into_parts(); | ||
vegarsti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| // Construct indices, sizes and offsets for the reversed array by iterating over | ||
| // the list view array in the logical order, and reversing the order of the elements. | ||
| // We end up with a list view array where the elements are in order, | ||
| // even if the original array had elements out of order. | ||
| let mut indices: Vec<O> = Vec::with_capacity(values.len()); | ||
| let mut new_sizes = Vec::with_capacity(sizes.len()); | ||
| let mut new_offsets: Vec<O> = Vec::with_capacity(offsets.len()); | ||
|
||
| let mut new_nulls = | ||
| Vec::with_capacity(nulls.clone().map(|nulls| nulls.len()).unwrap_or(0)); | ||
| new_offsets.push(O::zero()); | ||
vegarsti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| let has_nulls = nulls.is_some(); | ||
| for (i, offset) in offsets.iter().enumerate().take(offsets.len()) { | ||
vegarsti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| // If this array is null, we set the new array to null with size 0 and continue | ||
| if let Some(ref nulls) = nulls { | ||
| if nulls.is_null(i) { | ||
| new_nulls.push(false); // null | ||
| new_sizes.push(O::zero()); | ||
| new_offsets.push(new_offsets[i]); | ||
| continue; | ||
| } else { | ||
| new_nulls.push(true); // valid | ||
| } | ||
| } | ||
|
|
||
| // Each array is located at [offset, offset + size), so we collect indices in the reverse order | ||
| let array_start = offset.as_usize(); | ||
| let array_end = array_start + sizes[i].as_usize(); | ||
| for idx in (array_start..array_end).rev() { | ||
| indices.push(O::usize_as(idx)); | ||
| } | ||
vegarsti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| new_sizes.push(sizes[i]); | ||
| if i < sizes.len() - 1 { | ||
| new_offsets.push(new_offsets[i] + sizes[i]); | ||
| } | ||
vegarsti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| // Materialize values from underlying array with take | ||
| let indices_array: ArrayRef = if O::IS_LARGE { | ||
| Arc::new(arrow::array::UInt64Array::from( | ||
| indices | ||
| .iter() | ||
| .map(|i| i.as_usize() as u64) | ||
| .collect::<Vec<_>>(), | ||
| )) | ||
| } else { | ||
| Arc::new(UInt32Array::from( | ||
| indices | ||
| .iter() | ||
| .map(|i| i.as_usize() as u32) | ||
| .collect::<Vec<_>>(), | ||
| )) | ||
| }; | ||
|
Comment on lines
+240
to
+255
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm I do wonder if there is a better way to do this, will keep thinking 🤔 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, this feels cumbersome! |
||
| let values_reversed = take(&values, &indices_array, None)?; | ||
|
|
||
| Ok(Arc::new(GenericListViewArray::<O>::try_new( | ||
| Arc::clone(field), | ||
| ScalarBuffer::from(new_offsets), | ||
| ScalarBuffer::from(new_sizes), | ||
| values_reversed, | ||
| has_nulls.then_some(NullBuffer::from(new_nulls)), | ||
vegarsti marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| )?)) | ||
| } | ||
|
|
||
| fn fixed_size_array_reverse( | ||
| array: &FixedSizeListArray, | ||
| field: &FieldRef, | ||
|
|
@@ -219,3 +300,130 @@ fn fixed_size_array_reverse( | |
| Some(nulls.into()), | ||
| )?)) | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
| mod tests { | ||
| use crate::reverse::list_view_reverse; | ||
| use arrow::{ | ||
| array::{ | ||
| AsArray, GenericListViewArray, Int32Array, LargeListViewArray, ListViewArray, | ||
| OffsetSizeTrait, | ||
| }, | ||
| buffer::{NullBuffer, ScalarBuffer}, | ||
| datatypes::{DataType, Field, Int32Type}, | ||
| }; | ||
| use datafusion_common::Result; | ||
| use std::sync::Arc; | ||
|
|
||
| fn list_view_values<O: OffsetSizeTrait + TryFrom<i64>>( | ||
| array: &GenericListViewArray<O>, | ||
| ) -> Vec<Option<Vec<i32>>> { | ||
| array | ||
| .iter() | ||
| .map(|x| x.map(|x| x.as_primitive::<Int32Type>().values().to_vec())) | ||
| .collect() | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_reverse_list_view() -> Result<()> { | ||
| let field = Arc::new(Field::new("a", DataType::Int32, false)); | ||
| let offsets = ScalarBuffer::from(vec![0, 1, 6, 6]); | ||
| let sizes = ScalarBuffer::from(vec![1, 5, 0, 3]); | ||
| let values = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9])); | ||
| let nulls = Some(NullBuffer::from(vec![true, true, false, true])); | ||
| let list_view = ListViewArray::new(field, offsets, sizes, values, nulls); | ||
| let result = list_view_reverse( | ||
| &list_view, | ||
| &Arc::new(Field::new("test", DataType::Int32, true)), | ||
| )?; | ||
| let reversed = list_view_values(&result.as_list_view::<i32>()); | ||
| let expected = vec![ | ||
| Some(vec![1]), | ||
| Some(vec![6, 5, 4, 3, 2]), | ||
| None, | ||
| Some(vec![9, 8, 7]), | ||
| ]; | ||
| assert_eq!(expected, reversed); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_reverse_large_list_view() -> Result<()> { | ||
| let field = Arc::new(Field::new("a", DataType::Int32, false)); | ||
| let offsets = ScalarBuffer::from(vec![0, 1, 6, 6]); | ||
| let sizes = ScalarBuffer::from(vec![1, 5, 0, 3]); | ||
| let values = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9])); | ||
| let nulls = Some(NullBuffer::from(vec![true, true, false, true])); | ||
| let list_view = LargeListViewArray::new(field, offsets, sizes, values, nulls); | ||
| let result = list_view_reverse( | ||
| &list_view, | ||
| &Arc::new(Field::new("test", DataType::Int32, true)), | ||
| )?; | ||
| let reversed = list_view_values(&result.as_list_view::<i64>()); | ||
| let expected = vec![ | ||
| Some(vec![1]), | ||
| Some(vec![6, 5, 4, 3, 2]), | ||
| None, | ||
| Some(vec![9, 8, 7]), | ||
| ]; | ||
| assert_eq!(expected, reversed); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_reverse_list_view_out_of_order() -> Result<()> { | ||
| let field = Arc::new(Field::new("a", DataType::Int32, false)); | ||
| let offsets = ScalarBuffer::from(vec![6, 1, 6, 0]); // out of order | ||
| let sizes = ScalarBuffer::from(vec![3, 5, 0, 1]); | ||
| let values = Arc::new(Int32Array::from(vec![ | ||
| 1, // fourth array: offset 0, size 1 | ||
| 2, 3, 4, 5, 6, // second array: offset 1, size 5 | ||
| // third array: offset 6, size 0 (and null) | ||
| 7, 8, 9, // first array: offset 6, size 3 | ||
| ])); | ||
| let nulls = Some(NullBuffer::from(vec![true, true, false, true])); | ||
| let list_view = ListViewArray::new(field, offsets, sizes, values, nulls); | ||
| let result = list_view_reverse( | ||
| &list_view, | ||
| &Arc::new(Field::new("test", DataType::Int32, true)), | ||
| )?; | ||
| let reversed = list_view_values(&result.as_list_view::<i32>()); | ||
| let expected = vec![ | ||
| Some(vec![9, 8, 7]), | ||
| Some(vec![6, 5, 4, 3, 2]), | ||
| None, | ||
| Some(vec![1]), | ||
| ]; | ||
| assert_eq!(expected, reversed); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_reverse_list_view_with_nulls() -> Result<()> { | ||
| let field = Arc::new(Field::new("a", DataType::Int32, false)); | ||
| let offsets = ScalarBuffer::from(vec![16, 1, 6, 0]); // out of order | ||
| let sizes = ScalarBuffer::from(vec![3, 5, 10, 1]); | ||
| let values = Arc::new(Int32Array::from(vec![ | ||
| 1, // fourth array: offset 0, size 1 | ||
| 2, 3, 4, 5, 6, // second array: offset 1, size 5 | ||
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // third array: offset 6, size 10 | ||
| 7, 8, 9, // first array: offset 6, size 3 | ||
| ])); | ||
| let nulls = Some(NullBuffer::from(vec![true, true, false, true])); | ||
| let list_view = ListViewArray::new(field, offsets, sizes, values, nulls); | ||
| let result = list_view_reverse( | ||
| &list_view, | ||
| &Arc::new(Field::new("test", DataType::Int32, true)), | ||
| )?; | ||
| let result = result.as_list_view::<i32>(); | ||
| let reversed = list_view_values(&result); | ||
| let expected = vec![ | ||
| Some(vec![9, 8, 7]), | ||
| Some(vec![6, 5, 4, 3, 2]), | ||
| None, | ||
| Some(vec![1]), | ||
| ]; | ||
| assert_eq!(expected, reversed); | ||
| Ok(()) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have to admit I did not spend much time looking at
general_array_reverse, maybe I should have... It constructs aMutableArrayDataand operates on it, while this usestake. There might be a good reason whygeneral_array_reversedoesn't usetake?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Using
takemakes sense to me here. If you have time you could try both approaches and run a benchmark?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good idea, I will try that
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Alright, I tried the benchmark here #18425 with
array_lenof 10k (it's 100k on the branch), using MutableData is way worse 🫨 Baseline is the code on this PR, the code from this snippet is here.This indicates it might be worth using
takeinstead ofMutableDataon the regular array one too.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good to go with take in that case