Skip to content
Merged
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
190 changes: 190 additions & 0 deletions arrow-buffer/src/buffer/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,75 @@ impl MutableBuffer {
}
}

/// Adding to this mutable buffer `slice_to_repeat` repeated `repeat_count` times.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am wondering how much the unsafe log copying here makes a difference, vs ensuring reserve is called correctly.

Did you measure with code that was like:

reserve(slice.len() * repeat_count);
for _ in 0..repeat_count {
  buf.extend_from_slice(slice_to_repeat)
}

///
/// # Example
///
/// ## Repeat the same string bytes multiple times
/// ```
/// # use arrow_buffer::buffer::MutableBuffer;
/// let mut buffer = MutableBuffer::new(0);
/// let bytes_to_repeat = b"ab";
/// buffer.repeat_slice_n_times(bytes_to_repeat, 3);
/// assert_eq!(buffer.as_slice(), b"ababab");
/// ```
pub fn repeat_slice_n_times<T: ArrowNativeType>(
&mut self,
slice_to_repeat: &[T],
repeat_count: usize,
) {
if repeat_count == 0 || slice_to_repeat.is_empty() {
return;
}

let bytes_to_repeat = size_of_val(slice_to_repeat);

// Ensure capacity
self.reserve(repeat_count * bytes_to_repeat);

// Save the length before we do all the copies to know where to start from
let length_before = self.len;

// Copy the initial slice once so we can use doubling strategy on it
self.extend_from_slice(slice_to_repeat);

// This tracks how much bytes we have added by repeating so far
let added_repeats_length = bytes_to_repeat;
assert_eq!(
self.len - length_before,
added_repeats_length,
"should copy exactly the same number of bytes"
);

// Number of times the slice was repeated
let mut already_repeated_times = 1;

// We will use doubling strategy to fill the buffer in log(repeat_count) steps
while already_repeated_times < repeat_count {
// How many slices can we copy in this iteration
// (either double what we have, or just the remaining ones)
let number_of_slices_to_copy =
already_repeated_times.min(repeat_count - already_repeated_times);
let number_of_bytes_to_copy = number_of_slices_to_copy * bytes_to_repeat;

unsafe {
// Get to the start of the data before we started copying anything
let src = self.data.as_ptr().add(length_before) as *const u8;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rustc can probably figure it out, but src is the same for all loop iterations so could be pulled out of the loop I think

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, I thought about it but decided not to do it so the code for src and dst is close


// Go to the current location to copy to (end of current data)
let dst = self.data.as_ptr().add(self.len);

// SAFETY: the pointers are not overlapping as there is `number_of_bytes_to_copy` or less between them
std::ptr::copy_nonoverlapping(src, dst, number_of_bytes_to_copy)
}

// Advance the length by the amount of data we just copied (doubled)
self.len += number_of_bytes_to_copy;

already_repeated_times += number_of_slices_to_copy;
}
}

#[cold]
fn reallocate(&mut self, capacity: usize) {
let new_layout = Layout::from_size_align(capacity, self.layout.align()).unwrap();
Expand Down Expand Up @@ -1184,4 +1253,125 @@ mod tests {
assert_eq!(pool.used(), 0);
}
}

fn create_expected_repeated_slice<T: ArrowNativeType>(
slice_to_repeat: &[T],
repeat_count: usize,
) -> Buffer {
let mut expected = MutableBuffer::new(size_of_val(slice_to_repeat) * repeat_count);
for _ in 0..repeat_count {
// Not using push_slice_repeated as this is the function under test
expected.extend_from_slice(slice_to_repeat);
}
expected.into()
}

// Helper to test a specific repeat count with various slice sizes
fn test_repeat_count<T: ArrowNativeType + PartialEq + std::fmt::Debug>(
repeat_count: usize,
test_data: &[T],
) {
let mut buffer = MutableBuffer::new(0);
buffer.repeat_slice_n_times(test_data, repeat_count);

let expected = create_expected_repeated_slice(test_data, repeat_count);
let result: Buffer = buffer.into();

assert_eq!(
result,
expected,
"Failed for repeat_count={}, slice_len={}",
repeat_count,
test_data.len()
);
}

#[test]
fn test_repeat_slice_count_edge_cases() {
// Empty slice
test_repeat_count(100, &[] as &[i32]);

// Zero repeats
test_repeat_count(0, &[1i32, 2, 3]);
}

#[test]
fn test_small_repeats_counts() {
// test any special implementation for small repeat counts
let data = &[1u8, 2, 3, 4, 5];

for _ in 1..=10 {
test_repeat_count(2, data);
}
}

#[test]
fn test_different_size_of_i32_repeat_slice() {
let data: &[i32] = &[1, 2, 3];
let data_with_single_item: &[i32] = &[42];

for data in &[data, data_with_single_item] {
for item in 1..=9 {
let base_repeat_count = 2_usize.pow(item);
test_repeat_count(base_repeat_count - 1, data);
test_repeat_count(base_repeat_count, data);
test_repeat_count(base_repeat_count + 1, data);
}
}
}

#[test]
fn test_different_size_of_u8_repeat_slice() {
let data: &[u8] = &[1, 2, 3];
let data_with_single_item: &[u8] = &[10];

for data in &[data, data_with_single_item] {
for item in 1..=9 {
let base_repeat_count = 2_usize.pow(item);
test_repeat_count(base_repeat_count - 1, data);
test_repeat_count(base_repeat_count, data);
test_repeat_count(base_repeat_count + 1, data);
}
}
}

#[test]
fn test_different_size_of_u16_repeat_slice() {
let data: &[u16] = &[1, 2, 3];
let data_with_single_item: &[u16] = &[10];

for data in &[data, data_with_single_item] {
for item in 1..=9 {
let base_repeat_count = 2_usize.pow(item);
test_repeat_count(base_repeat_count - 1, data);
test_repeat_count(base_repeat_count, data);
test_repeat_count(base_repeat_count + 1, data);
}
}
}

#[test]
fn test_various_slice_lengths() {
// Test different slice lengths with same repeat pattern
let repeat_count = 37; // Arbitrary non-power-of-2

// Single element
test_repeat_count(repeat_count, &[42i32]);

// Small slices
test_repeat_count(repeat_count, &[1i32, 2]);
test_repeat_count(repeat_count, &[1i32, 2, 3]);
test_repeat_count(repeat_count, &[1i32, 2, 3, 4]);
test_repeat_count(repeat_count, &[1i32, 2, 3, 4, 5]);

// Larger slices
let data_10: Vec<i32> = (0..10).collect();
test_repeat_count(repeat_count, &data_10);

let data_100: Vec<i32> = (0..100).collect();
test_repeat_count(repeat_count, &data_100);

let data_1000: Vec<i32> = (0..1000).collect();
test_repeat_count(repeat_count, &data_1000);
}
}
Loading