Skip to content

Commit e50907a

Browse files
authored
Use generics in create_hashes (#42)
1 parent 0e98891 commit e50907a

File tree

5 files changed

+96
-80
lines changed

5 files changed

+96
-80
lines changed

datafusion/common/src/hash_utils.rs

Lines changed: 84 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ fn hash_dictionary<K: ArrowDictionaryKeyType>(
214214
// redundant hashing for large dictionary elements (e.g. strings)
215215
let dict_values = array.values();
216216
let mut dict_hashes = vec![0; dict_values.len()];
217-
create_hashes_from_arrays(&[dict_values.as_ref()], random_state, &mut dict_hashes)?;
217+
create_hashes([dict_values], random_state, &mut dict_hashes)?;
218218

219219
// combine hash for each index in values
220220
for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
@@ -308,7 +308,7 @@ where
308308
let offsets = array.value_offsets();
309309
let nulls = array.nulls();
310310
let mut values_hashes = vec![0u64; values.len()];
311-
create_hashes_from_arrays(&[values.as_ref()], random_state, &mut values_hashes)?;
311+
create_hashes([values], random_state, &mut values_hashes)?;
312312
if let Some(nulls) = nulls {
313313
for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
314314
if nulls.is_valid(i) {
@@ -339,7 +339,7 @@ fn hash_fixed_list_array(
339339
let value_length = array.value_length() as usize;
340340
let nulls = array.nulls();
341341
let mut values_hashes = vec![0u64; values.len()];
342-
create_hashes_from_arrays(&[values.as_ref()], random_state, &mut values_hashes)?;
342+
create_hashes([values], random_state, &mut values_hashes)?;
343343
if let Some(nulls) = nulls {
344344
for i in 0..array.len() {
345345
if nulls.is_valid(i) {
@@ -434,41 +434,51 @@ fn hash_single_array(
434434
Ok(())
435435
}
436436

437-
/// Creates hash values for every row, based on the values in the columns.
438-
///
439-
/// The number of rows to hash is determined by `hashes_buffer.len()`.
440-
/// `hashes_buffer` should be pre-sized appropriately
441-
///
442-
/// This is the same as [`create_hashes`] but accepts `&dyn Array`s instead of requiring
443-
/// `ArrayRef`s.
444-
pub fn create_hashes_from_arrays<'a>(
445-
arrays: &[&dyn Array],
446-
random_state: &RandomState,
447-
hashes_buffer: &'a mut Vec<u64>,
448-
) -> Result<&'a mut Vec<u64>> {
449-
for (i, &array) in arrays.iter().enumerate() {
450-
// combine hashes with `combine_hashes` for all columns besides the first
451-
let rehash = i >= 1;
452-
hash_single_array(array, random_state, hashes_buffer, rehash)?;
437+
pub trait AsDynArray {
438+
fn as_dyn_array(&self) -> &dyn Array;
439+
}
440+
441+
impl AsDynArray for dyn Array {
442+
fn as_dyn_array(&self) -> &dyn Array {
443+
self
444+
}
445+
}
446+
447+
impl AsDynArray for &dyn Array {
448+
fn as_dyn_array(&self) -> &dyn Array {
449+
*self
450+
}
451+
}
452+
453+
impl AsDynArray for ArrayRef {
454+
fn as_dyn_array(&self) -> &dyn Array {
455+
self.as_ref()
456+
}
457+
}
458+
459+
impl AsDynArray for &ArrayRef {
460+
fn as_dyn_array(&self) -> &dyn Array {
461+
self.as_ref()
453462
}
454-
Ok(hashes_buffer)
455463
}
456464

457465
/// Creates hash values for every row, based on the values in the columns.
458466
///
459467
/// The number of rows to hash is determined by `hashes_buffer.len()`.
460468
/// `hashes_buffer` should be pre-sized appropriately.
461-
///
462-
/// This is the same as [`create_hashes_from_arrays`] but accepts `ArrayRef`s.
463-
pub fn create_hashes<'a>(
464-
arrays: &[ArrayRef],
469+
pub fn create_hashes<'a, I, T>(
470+
arrays: I,
465471
random_state: &RandomState,
466472
hashes_buffer: &'a mut Vec<u64>,
467-
) -> Result<&'a mut Vec<u64>> {
468-
for (i, array) in arrays.iter().enumerate() {
473+
) -> Result<&'a mut Vec<u64>>
474+
where
475+
I: IntoIterator<Item = T>,
476+
T: AsDynArray,
477+
{
478+
for (i, array) in arrays.into_iter().enumerate() {
469479
// combine hashes with `combine_hashes` for all columns besides the first
470480
let rehash = i >= 1;
471-
hash_single_array(array.as_ref(), random_state, hashes_buffer, rehash)?;
481+
hash_single_array(array.as_dyn_array(), random_state, hashes_buffer, rehash)?;
472482
}
473483
Ok(hashes_buffer)
474484
}
@@ -491,7 +501,7 @@ mod tests {
491501
.collect::<Decimal128Array>()
492502
.with_precision_and_scale(20, 3)
493503
.unwrap();
494-
let array_ref = Arc::new(array);
504+
let array_ref: ArrayRef = Arc::new(array);
495505
let random_state = RandomState::with_seeds(0, 0, 0, 0);
496506
let hashes_buff = &mut vec![0; array_ref.len()];
497507
let hashes = create_hashes(&[array_ref], &random_state, hashes_buff)?;
@@ -504,15 +514,21 @@ mod tests {
504514
let empty_array = FixedSizeListBuilder::new(StringBuilder::new(), 1).finish();
505515
let random_state = RandomState::with_seeds(0, 0, 0, 0);
506516
let hashes_buff = &mut vec![0; 0];
507-
let hashes = create_hashes(&[Arc::new(empty_array)], &random_state, hashes_buff)?;
517+
let hashes = create_hashes(
518+
&[Arc::new(empty_array) as ArrayRef],
519+
&random_state,
520+
hashes_buff,
521+
)?;
508522
assert_eq!(hashes, &Vec::<u64>::new());
509523
Ok(())
510524
}
511525

512526
#[test]
513527
fn create_hashes_for_float_arrays() -> Result<()> {
514-
let f32_arr = Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7]));
515-
let f64_arr = Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7]));
528+
let f32_arr: ArrayRef =
529+
Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7]));
530+
let f64_arr: ArrayRef =
531+
Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7]));
516532

517533
let random_state = RandomState::with_seeds(0, 0, 0, 0);
518534
let hashes_buff = &mut vec![0; f32_arr.len()];
@@ -540,8 +556,10 @@ mod tests {
540556
Some(b"Longer than 12 bytes string"),
541557
];
542558

543-
let binary_array = Arc::new(binary.iter().cloned().collect::<$ARRAY>());
544-
let ref_array = Arc::new(binary.iter().cloned().collect::<BinaryArray>());
559+
let binary_array: ArrayRef =
560+
Arc::new(binary.iter().cloned().collect::<$ARRAY>());
561+
let ref_array: ArrayRef =
562+
Arc::new(binary.iter().cloned().collect::<BinaryArray>());
545563

546564
let random_state = RandomState::with_seeds(0, 0, 0, 0);
547565

@@ -579,7 +597,7 @@ mod tests {
579597
#[test]
580598
fn create_hashes_fixed_size_binary() -> Result<()> {
581599
let input_arg = vec![vec![1, 2], vec![5, 6], vec![5, 6]];
582-
let fixed_size_binary_array =
600+
let fixed_size_binary_array: ArrayRef =
583601
Arc::new(FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap());
584602

585603
let random_state = RandomState::with_seeds(0, 0, 0, 0);
@@ -606,8 +624,9 @@ mod tests {
606624
Some("Longer than 12 bytes string"),
607625
];
608626

609-
let string_array = Arc::new(strings.iter().cloned().collect::<$ARRAY>());
610-
let dict_array = Arc::new(
627+
let string_array: ArrayRef =
628+
Arc::new(strings.iter().cloned().collect::<$ARRAY>());
629+
let dict_array: ArrayRef = Arc::new(
611630
strings
612631
.iter()
613632
.cloned()
@@ -655,8 +674,9 @@ mod tests {
655674
fn create_hashes_for_dict_arrays() {
656675
let strings = [Some("foo"), None, Some("bar"), Some("foo"), None];
657676

658-
let string_array = Arc::new(strings.iter().cloned().collect::<StringArray>());
659-
let dict_array = Arc::new(
677+
let string_array: ArrayRef =
678+
Arc::new(strings.iter().cloned().collect::<StringArray>());
679+
let dict_array: ArrayRef = Arc::new(
660680
strings
661681
.iter()
662682
.cloned()
@@ -891,8 +911,9 @@ mod tests {
891911
let strings1 = [Some("foo"), None, Some("bar")];
892912
let strings2 = [Some("blarg"), Some("blah"), None];
893913

894-
let string_array = Arc::new(strings1.iter().cloned().collect::<StringArray>());
895-
let dict_array = Arc::new(
914+
let string_array: ArrayRef =
915+
Arc::new(strings1.iter().cloned().collect::<StringArray>());
916+
let dict_array: ArrayRef = Arc::new(
896917
strings2
897918
.iter()
898919
.cloned()
@@ -925,23 +946,36 @@ mod tests {
925946

926947
#[test]
927948
fn test_create_hashes_from_arrays() {
928-
let int_array = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
929-
let float_array = Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
949+
let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
950+
let float_array: ArrayRef =
951+
Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
930952

931953
let random_state = RandomState::with_seeds(0, 0, 0, 0);
932954
let hashes_buff = &mut vec![0; int_array.len()];
933-
let hashes = create_hashes_from_arrays(
934-
&[int_array.as_ref(), float_array.as_ref()],
935-
&random_state,
936-
hashes_buff,
937-
)
938-
.unwrap();
955+
let hashes =
956+
create_hashes(&[int_array, float_array], &random_state, hashes_buff).unwrap();
939957
assert_eq!(hashes.len(), 4,);
940958
}
941959

960+
#[test]
961+
fn test_create_hashes_from_dyn_arrays() {
962+
let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
963+
let float_array: ArrayRef =
964+
Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
965+
966+
// Verify that we can call create_hashes with only &dyn Array
967+
fn test(arr1: &dyn Array, arr2: &dyn Array) {
968+
let random_state = RandomState::with_seeds(0, 0, 0, 0);
969+
let hashes_buff = &mut vec![0; arr1.len()];
970+
let hashes = create_hashes([arr1, arr2], &random_state, hashes_buff).unwrap();
971+
assert_eq!(hashes.len(), 4,);
972+
}
973+
test(&*int_array, &*float_array);
974+
}
975+
942976
#[test]
943977
fn test_create_hashes_equivalence() {
944-
let array = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
978+
let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
945979
let random_state = RandomState::with_seeds(0, 0, 0, 0);
946980

947981
let mut hashes1 = vec![0; array.len()];
@@ -953,8 +987,7 @@ mod tests {
953987
.unwrap();
954988

955989
let mut hashes2 = vec![0; array.len()];
956-
create_hashes_from_arrays(&[array.as_ref()], &random_state, &mut hashes2)
957-
.unwrap();
990+
create_hashes([array], &random_state, &mut hashes2).unwrap();
958991

959992
assert_eq!(hashes1, hashes2);
960993
}

datafusion/common/src/scalar/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ use crate::cast::{
5151
};
5252
use crate::error::{DataFusionError, Result, _exec_err, _internal_err, _not_impl_err};
5353
use crate::format::DEFAULT_CAST_OPTIONS;
54-
use crate::hash_utils::create_hashes_from_arrays;
54+
use crate::hash_utils::create_hashes;
5555
use crate::utils::SingleRowListArrayBuilder;
5656
use crate::{_internal_datafusion_err, arrow_datafusion_err};
5757
use arrow::array::{
@@ -880,7 +880,7 @@ fn hash_nested_array<H: Hasher>(arr: ArrayRef, state: &mut H) {
880880
let len = arr.len();
881881
let hashes_buffer = &mut vec![0; len];
882882
let random_state = ahash::RandomState::with_seeds(0, 0, 0, 0);
883-
let hashes = create_hashes_from_arrays(&[arr.as_ref()], &random_state, hashes_buffer)
883+
let hashes = create_hashes(&[arr], &random_state, hashes_buffer)
884884
.expect("hash_nested_array: failed to create row hashes");
885885
// Hash back to std::hash::Hasher
886886
hashes.hash(state);

datafusion/physical-expr-common/src/binary_map.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ use arrow::array::{
2727
};
2828
use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
2929
use arrow::datatypes::DataType;
30-
use datafusion_common::hash_utils::create_hashes_from_arrays;
30+
use datafusion_common::hash_utils::create_hashes;
3131
use datafusion_common::utils::proxy::{HashTableAllocExt, VecAllocExt};
3232
use std::any::type_name;
3333
use std::fmt::Debug;
@@ -349,7 +349,7 @@ where
349349
let batch_hashes = &mut self.hashes_buffer;
350350
batch_hashes.clear();
351351
batch_hashes.resize(values.len(), 0);
352-
create_hashes_from_arrays(&[values.as_ref()], &self.random_state, batch_hashes)
352+
create_hashes([values], &self.random_state, batch_hashes)
353353
// hash is supported for all types and create_hashes only
354354
// returns errors for unsupported types
355355
.unwrap();

datafusion/physical-expr-common/src/binary_view_map.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,16 @@
1919
//! `StringViewArray`/`BinaryViewArray`.
2020
//! Much of the code is from `binary_map.rs`, but with simpler implementation because we directly use the
2121
//! [`GenericByteViewBuilder`].
22+
use crate::binary_map::OutputType;
2223
use ahash::RandomState;
2324
use arrow::array::cast::AsArray;
2425
use arrow::array::{Array, ArrayBuilder, ArrayRef, GenericByteViewBuilder};
2526
use arrow::datatypes::{BinaryViewType, ByteViewType, DataType, StringViewType};
26-
use datafusion_common::hash_utils::create_hashes_from_arrays;
27+
use datafusion_common::hash_utils::create_hashes;
2728
use datafusion_common::utils::proxy::{HashTableAllocExt, VecAllocExt};
2829
use std::fmt::Debug;
2930
use std::sync::Arc;
3031

31-
use crate::binary_map::OutputType;
32-
3332
/// HashSet optimized for storing string or binary values that can produce that
3433
/// the final set as a `GenericBinaryViewArray` with minimal copies.
3534
#[derive(Debug)]
@@ -243,7 +242,7 @@ where
243242
let batch_hashes = &mut self.hashes_buffer;
244243
batch_hashes.clear();
245244
batch_hashes.resize(values.len(), 0);
246-
create_hashes_from_arrays(&[values.as_ref()], &self.random_state, batch_hashes)
245+
create_hashes([values], &self.random_state, batch_hashes)
247246
// hash is supported for all types and create_hashes only
248247
// returns errors for unsupported types
249248
.unwrap();

datafusion/physical-plan/src/joins/hash_join/exec.rs

Lines changed: 5 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1515,7 +1515,7 @@ mod tests {
15151515
use arrow::buffer::NullBuffer;
15161516
use arrow::datatypes::{DataType, Field};
15171517
use arrow_schema::Schema;
1518-
use datafusion_common::hash_utils::create_hashes_from_arrays;
1518+
use datafusion_common::hash_utils::create_hashes;
15191519
use datafusion_common::test_util::{batches_to_sort_string, batches_to_string};
15201520
use datafusion_common::{
15211521
assert_batches_eq, assert_batches_sorted_eq, assert_contains, exec_err,
@@ -3454,11 +3454,7 @@ mod tests {
34543454

34553455
let random_state = RandomState::with_seeds(0, 0, 0, 0);
34563456
let hashes_buff = &mut vec![0; left.num_rows()];
3457-
let hashes = create_hashes_from_arrays(
3458-
&[left.columns()[0].as_ref()],
3459-
&random_state,
3460-
hashes_buff,
3461-
)?;
3457+
let hashes = create_hashes([&left.columns()[0]], &random_state, hashes_buff)?;
34623458

34633459
// Maps both values to both indices (1 and 2, representing input 0 and 1)
34643460
// 0 -> (0, 1)
@@ -3487,11 +3483,7 @@ mod tests {
34873483
let right_keys_values =
34883484
key_column.evaluate(&right)?.into_array(right.num_rows())?;
34893485
let mut hashes_buffer = vec![0; right.num_rows()];
3490-
create_hashes_from_arrays(
3491-
&[right_keys_values.as_ref()],
3492-
&random_state,
3493-
&mut hashes_buffer,
3494-
)?;
3486+
create_hashes([&right_keys_values], &random_state, &mut hashes_buffer)?;
34953487

34963488
let (l, r, _) = lookup_join_hashmap(
34973489
&join_hash_map,
@@ -3525,11 +3517,7 @@ mod tests {
35253517

35263518
let random_state = RandomState::with_seeds(0, 0, 0, 0);
35273519
let hashes_buff = &mut vec![0; left.num_rows()];
3528-
let hashes = create_hashes_from_arrays(
3529-
&[left.columns()[0].as_ref()],
3530-
&random_state,
3531-
hashes_buff,
3532-
)?;
3520+
let hashes = create_hashes([&left.columns()[0]], &random_state, hashes_buff)?;
35333521

35343522
hashmap_left.insert_unique(hashes[0], (hashes[0], 1u32), |(h, _)| *h);
35353523
hashmap_left.insert_unique(hashes[0], (hashes[0], 2u32), |(h, _)| *h);
@@ -3552,11 +3540,7 @@ mod tests {
35523540
let right_keys_values =
35533541
key_column.evaluate(&right)?.into_array(right.num_rows())?;
35543542
let mut hashes_buffer = vec![0; right.num_rows()];
3555-
create_hashes_from_arrays(
3556-
&[right_keys_values.as_ref()],
3557-
&random_state,
3558-
&mut hashes_buffer,
3559-
)?;
3543+
create_hashes([&right_keys_values], &random_state, &mut hashes_buffer)?;
35603544

35613545
let (l, r, _) = lookup_join_hashmap(
35623546
&join_hash_map,

0 commit comments

Comments
 (0)