Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions datafusion/common/src/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ use crate::{downcast_value, Result};
use arrow::array::{
BinaryViewArray, Decimal32Array, Decimal64Array, DurationMicrosecondArray,
DurationMillisecondArray, DurationNanosecondArray, DurationSecondArray, Float16Array,
Int16Array, Int8Array, LargeBinaryArray, LargeStringArray, StringViewArray,
UInt16Array,
Int16Array, Int8Array, LargeBinaryArray, LargeListViewArray, LargeStringArray,
ListViewArray, StringViewArray, UInt16Array,
};
use arrow::{
array::{
Expand Down Expand Up @@ -324,3 +324,13 @@ pub fn as_generic_string_array<T: OffsetSizeTrait>(
) -> Result<&GenericStringArray<T>> {
Ok(downcast_value!(array, GenericStringArray, T))
}

/// Downcasts a dynamically typed [`Array`] reference to a [`ListViewArray`].
///
/// # Errors
///
/// Propagates the error produced by `downcast_value!` when `array` is not a
/// `ListViewArray`.
pub fn as_list_view_array(array: &dyn Array) -> Result<&ListViewArray> {
    let list_view: &ListViewArray = downcast_value!(array, ListViewArray);
    Ok(list_view)
}

/// Downcasts a dynamically typed [`Array`] reference to a [`LargeListViewArray`].
///
/// # Errors
///
/// Propagates the error produced by `downcast_value!` when `array` is not a
/// `LargeListViewArray`.
pub fn as_large_list_view_array(array: &dyn Array) -> Result<&LargeListViewArray> {
    let large_list_view: &LargeListViewArray =
        downcast_value!(array, LargeListViewArray);
    Ok(large_list_view)
}
218 changes: 213 additions & 5 deletions datafusion/functions-nested/src/reverse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,18 @@

use crate::utils::make_scalar_function;
use arrow::array::{
Array, ArrayRef, Capacities, FixedSizeListArray, GenericListArray, MutableArrayData,
OffsetSizeTrait,
Array, ArrayRef, Capacities, FixedSizeListArray, GenericListArray,
GenericListViewArray, MutableArrayData, OffsetSizeTrait, UInt32Array,
};
use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
use arrow::compute::take;
use arrow::datatypes::DataType::{
FixedSizeList, LargeList, LargeListView, List, ListView, Null,
};
use arrow::buffer::OffsetBuffer;
use arrow::datatypes::DataType::{FixedSizeList, LargeList, List, Null};
use arrow::datatypes::{DataType, FieldRef};
use datafusion_common::cast::{
as_fixed_size_list_array, as_large_list_array, as_list_array,
as_fixed_size_list_array, as_large_list_array, as_large_list_view_array,
as_list_array, as_list_view_array,
};
use datafusion_common::{exec_err, utils::take_function_args, Result};
use datafusion_expr::{
Expand Down Expand Up @@ -133,6 +137,14 @@ pub fn array_reverse_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
fixed_size_array_reverse(array, field)
}
Null => Ok(Arc::clone(input_array)),
ListView(field) => {
let array = as_list_view_array(input_array)?;
list_view_reverse::<i32>(array, field)
}
LargeListView(field) => {
let array = as_large_list_view_array(input_array)?;
list_view_reverse::<i64>(array, field)
}
array_type => exec_err!("array_reverse does not support type '{array_type}'."),
}
}
Expand Down Expand Up @@ -183,6 +195,75 @@ fn general_array_reverse<O: OffsetSizeTrait + TryFrom<i64>>(
)?))
}

fn list_view_reverse<O: OffsetSizeTrait + TryFrom<i64>>(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have to admit I did not spend much time looking at general_array_reverse, maybe I should have... It constructs a MutableArrayData and operates on it, while this uses take. There might be a good reason why general_array_reverse doesn't use take?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using take makes sense to me here. If you have time you could try both approaches and run a benchmark?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea, I will try that

Copy link
Contributor Author

@vegarsti vegarsti Nov 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alright, I tried the benchmark here #18425 with array_len of 10k (it's 100k on the branch), using MutableData is way worse 🫨 Baseline is the code on this PR, the code from this snippet is here.

array_reverse           time:   [44.858 µs 44.865 µs 44.874 µs]
                        change: [+545.75% +547.46% +549.17%] (p = 0.00 < 0.05)
                        Performance has regressed.

This indicates it might be worth using take instead of MutableData on the regular array one too.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good to go with take in that case

array: &GenericListViewArray<O>,
field: &FieldRef,
) -> Result<ArrayRef> {
let (_, offsets, sizes, values, nulls) = array.clone().into_parts();

// Construct indices, sizes and offsets for the reversed array by iterating over
// the list view array in the logical order, and reversing the order of the elements.
// We end up with a list view array where the elements are in order,
// even if the original array had elements out of order.
let mut indices: Vec<O> = Vec::with_capacity(values.len());
let mut new_sizes = Vec::with_capacity(sizes.len());
let mut new_offsets: Vec<O> = Vec::with_capacity(offsets.len());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW you can convert the existing indices to a Vec for in place update as well -- I tried to document that a bit more in apache/arrow-rs#8771

let mut new_nulls =
Vec::with_capacity(nulls.clone().map(|nulls| nulls.len()).unwrap_or(0));
new_offsets.push(O::zero());
let has_nulls = nulls.is_some();
for (i, offset) in offsets.iter().enumerate().take(offsets.len()) {
// If this array is null, we set the new array to null with size 0 and continue
if let Some(ref nulls) = nulls {
if nulls.is_null(i) {
new_nulls.push(false); // null
new_sizes.push(O::zero());
new_offsets.push(new_offsets[i]);
continue;
} else {
new_nulls.push(true); // valid
}
}

// Each array is located at [offset, offset + size), so we collect indices in the reverse order
let array_start = offset.as_usize();
let array_end = array_start + sizes[i].as_usize();
for idx in (array_start..array_end).rev() {
indices.push(O::usize_as(idx));
}
new_sizes.push(sizes[i]);
if i < sizes.len() - 1 {
new_offsets.push(new_offsets[i] + sizes[i]);
}
}

// Materialize values from underlying array with take
let indices_array: ArrayRef = if O::IS_LARGE {
Arc::new(arrow::array::UInt64Array::from(
indices
.iter()
.map(|i| i.as_usize() as u64)
.collect::<Vec<_>>(),
))
} else {
Arc::new(UInt32Array::from(
indices
.iter()
.map(|i| i.as_usize() as u32)
.collect::<Vec<_>>(),
))
};
Comment on lines +240 to +255
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm I do wonder if there is a better way to do this, will keep thinking 🤔

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this feels cumbersome!

let values_reversed = take(&values, &indices_array, None)?;

Ok(Arc::new(GenericListViewArray::<O>::try_new(
Arc::clone(field),
ScalarBuffer::from(new_offsets),
ScalarBuffer::from(new_sizes),
values_reversed,
has_nulls.then_some(NullBuffer::from(new_nulls)),
)?))
}

fn fixed_size_array_reverse(
array: &FixedSizeListArray,
field: &FieldRef,
Expand Down Expand Up @@ -219,3 +300,130 @@ fn fixed_size_array_reverse(
Some(nulls.into()),
)?))
}

#[cfg(test)]
mod tests {
    use crate::reverse::list_view_reverse;
    use arrow::{
        array::{AsArray, GenericListViewArray, Int32Array, OffsetSizeTrait},
        buffer::{NullBuffer, ScalarBuffer},
        datatypes::{DataType, Field, Int32Type},
    };
    use datafusion_common::Result;
    use std::sync::Arc;

    /// Collects every row of an `Int32` list-view array into plain vectors,
    /// using `None` for null rows.
    fn list_view_values<O: OffsetSizeTrait + TryFrom<i64>>(
        array: &GenericListViewArray<O>,
    ) -> Vec<Option<Vec<i32>>> {
        let row_to_vec = |row: arrow::array::ArrayRef| {
            row.as_primitive::<Int32Type>().values().to_vec()
        };
        array.iter().map(|row| row.map(row_to_vec)).collect()
    }

    /// Builds an `Int32` list-view array from the given buffers, reverses it
    /// with `list_view_reverse`, and returns the resulting rows.
    fn reverse_rows<O: OffsetSizeTrait + TryFrom<i64>>(
        offsets: Vec<O>,
        sizes: Vec<O>,
        values: Vec<i32>,
        validity: Vec<bool>,
    ) -> Result<Vec<Option<Vec<i32>>>> {
        let input = GenericListViewArray::<O>::new(
            Arc::new(Field::new("a", DataType::Int32, false)),
            ScalarBuffer::from(offsets),
            ScalarBuffer::from(sizes),
            Arc::new(Int32Array::from(values)),
            Some(NullBuffer::from(validity)),
        );
        let output_field = Arc::new(Field::new("test", DataType::Int32, true));
        let reversed = list_view_reverse(&input, &output_field)?;
        Ok(list_view_values(reversed.as_list_view::<O>()))
    }

    #[test]
    fn test_reverse_list_view() -> Result<()> {
        let actual = reverse_rows::<i32>(
            vec![0, 1, 6, 6],
            vec![1, 5, 0, 3],
            vec![1, 2, 3, 4, 5, 6, 7, 8, 9],
            vec![true, true, false, true],
        )?;
        let expected = vec![
            Some(vec![1]),
            Some(vec![6, 5, 4, 3, 2]),
            None,
            Some(vec![9, 8, 7]),
        ];
        assert_eq!(expected, actual);
        Ok(())
    }

    #[test]
    fn test_reverse_large_list_view() -> Result<()> {
        let actual = reverse_rows::<i64>(
            vec![0, 1, 6, 6],
            vec![1, 5, 0, 3],
            vec![1, 2, 3, 4, 5, 6, 7, 8, 9],
            vec![true, true, false, true],
        )?;
        let expected = vec![
            Some(vec![1]),
            Some(vec![6, 5, 4, 3, 2]),
            None,
            Some(vec![9, 8, 7]),
        ];
        assert_eq!(expected, actual);
        Ok(())
    }

    #[test]
    fn test_reverse_list_view_out_of_order() -> Result<()> {
        let actual = reverse_rows::<i32>(
            vec![6, 1, 6, 0], // out of order
            vec![3, 5, 0, 1],
            vec![
                1, // fourth array: offset 0, size 1
                2, 3, 4, 5, 6, // second array: offset 1, size 5
                // third array: offset 6, size 0 (and null)
                7, 8, 9, // first array: offset 6, size 3
            ],
            vec![true, true, false, true],
        )?;
        let expected = vec![
            Some(vec![9, 8, 7]),
            Some(vec![6, 5, 4, 3, 2]),
            None,
            Some(vec![1]),
        ];
        assert_eq!(expected, actual);
        Ok(())
    }

    #[test]
    fn test_reverse_list_view_with_nulls() -> Result<()> {
        // The null third row still points at ten values; they must not show
        // up in the reversed output.
        let actual = reverse_rows::<i32>(
            vec![16, 1, 6, 0], // out of order
            vec![3, 5, 10, 1],
            vec![
                1, // fourth array: offset 0, size 1
                2, 3, 4, 5, 6, // second array: offset 1, size 5
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // third array: offset 6, size 10 (null)
                7, 8, 9, // first array: offset 16, size 3
            ],
            vec![true, true, false, true],
        )?;
        let expected = vec![
            Some(vec![9, 8, 7]),
            Some(vec![6, 5, 4, 3, 2]),
            None,
            Some(vec![1]),
        ];
        assert_eq!(expected, actual);
        Ok(())
    }
}
Loading