diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 1ff25db7..542b4cf4 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -47,13 +47,13 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Install or use cached foresterre/cargo-msrv - uses: baptiste0928/cargo-install@v1 - with: - crate: cargo-msrv + - name: Install foresterre/cargo-msrv without cache (takes longer, but caching produces unexpected behaviour) + run: cargo install cargo-msrv - - name: Verify the Rustc version declared in `cargo.toml` + - name: Verify the Rustc version declared in `cargo.toml` without cache (takes longer, but caching produces unexpected behaviour) run: | + rm -f Cargo.lock + cargo update cargo-msrv verify # github actions does not support big endian systems directly, but it does support QEMU. @@ -82,13 +82,11 @@ jobs: run: sudo systemctl start docker - name: Cross-Compile project to mips-unknown-linux-gnu - run: | - cross build --target=mips-unknown-linux-gnu --verbose + run: cross build --target=mips-unknown-linux-gnu --verbose # https://github.com/cross-rs/cross#supported-targets - name: Cross-Run Tests in mips-unknown-linux-gnu using Qemu - run: | - cross test --target mips-unknown-linux-gnu --verbose + run: cross test --target mips-unknown-linux-gnu --verbose wasm32: runs-on: ubuntu-latest @@ -109,3 +107,4 @@ jobs: - name: Run tests without default features run: cargo test --verbose --no-default-features + diff --git a/Cargo.toml b/Cargo.toml index 4d2adec4..51103f3f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,7 +28,7 @@ proc-macro = false [dependencies] lebe = "^0.5.2" # generic binary serialization -half = ">=2.1.0, <2.3" # 16 bit float pixel data type +half = ">=2.1.0, <2.3" # 16 bit float pixel data type bit_field = "^0.10.1" # exr file version bit flags miniz_oxide = "^0.7.1" # zip compression for pxr24 smallvec = "^1.7.0" # make cache-friendly allocations TODO profile if smallvec is really an improvement! diff --git a/benches/pixel_format_conversion.rs b/benches/pixel_format_conversion.rs index 705963cc..3d1ba930 100644 --- a/benches/pixel_format_conversion.rs +++ b/benches/pixel_format_conversion.rs @@ -8,62 +8,112 @@ use bencher::Bencher; use std::fs; use std::io::Cursor; use exr::image::pixel_vec::PixelVec; +use exr::io::Data; +use exr::block::samples::FromNativeSample; + +const F32_ZIPS_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_zips.exr"; +const F32_UNCOMPRESSED_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_uncompressed.exr"; +const F16_UNCOMPRESSED_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_uncompressed_half.exr"; +const F16_ZIP_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_zip_half.exr"; /// Read an image from an in-memory buffer into its native f32 format -fn read_image_rgba_f32_to_f32(bench: &mut Bencher) { - let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap(); - bencher::black_box(&mut file); +fn read_f32_as_f32_uncompressed_1thread(bench: &mut Bencher) { + bench_read_image_rgba_as::(bench, F32_UNCOMPRESSED_PATH, false); +} - bench.iter(||{ - let image = exr::prelude::read() - .no_deep_data().largest_resolution_level() - .rgba_channels(PixelVec::<(f32,f32,f32,f32)>::constructor, PixelVec::set_pixel) - .all_layers().all_attributes() - .non_parallel() - .from_buffered(Cursor::new(file.as_slice())).unwrap(); +/// Read image and convert the samples to u32 (from native f32) +fn read_f32_as_u32_uncompressed_1thread(bench: &mut Bencher) { + bench_read_image_rgba_as::(bench, F32_UNCOMPRESSED_PATH, false); +} - bencher::black_box(image); - }) +/// f16 is not natively supported by CPUs, which introduces unique performance pitfalls +fn read_f32_as_f16_uncompressed_1thread(bench: &mut Bencher) { + bench_read_image_rgba_as::(bench, F32_UNCOMPRESSED_PATH, false); } -/// Read image and convert the samples to u32 (from native f32) -fn read_image_rgba_f32_to_u32(bench: &mut Bencher) { - let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap(); - bencher::black_box(&mut file); +fn read_f16_as_f16_uncompressed_1thread(bench: &mut Bencher) { + bench_read_image_rgba_as::(bench, F16_UNCOMPRESSED_PATH, false); +} - bench.iter(||{ - let image = exr::prelude::read() - .no_deep_data().largest_resolution_level() - .rgba_channels(PixelVec::<(u32,u32,u32,u32)>::constructor, PixelVec::set_pixel) - .all_layers().all_attributes() - .non_parallel() - .from_buffered(Cursor::new(file.as_slice())).unwrap(); +fn read_f16_as_f32_uncompressed_1thread(bench: &mut Bencher) { + bench_read_image_rgba_as::(bench, F16_UNCOMPRESSED_PATH, false); +} - bencher::black_box(image); - }) +fn read_f16_as_u32_uncompressed_1thread(bench: &mut Bencher) { + bench_read_image_rgba_as::(bench, F16_UNCOMPRESSED_PATH, false); } -/// f16 is not natively supported by CPUs, which introduces unique performance pitfalls -fn read_image_rgba_f32_to_f16(bench: &mut Bencher) { - let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap(); + +fn read_f32_as_f16_zips_1thread(bench: &mut Bencher) { + bench_read_image_rgba_as::(bench, F32_ZIPS_PATH, false); +} + +fn read_f16_as_f32_zip_1thread(bench: &mut Bencher) { + bench_read_image_rgba_as::(bench, F16_ZIP_PATH, false); +} + +fn read_f32_as_f16_zips_nthreads(bench: &mut Bencher) { + bench_read_image_rgba_as::(bench, F32_ZIPS_PATH, true); +} + +fn read_f16_as_f32_zip_nthreads(bench: &mut Bencher) { + bench_read_image_rgba_as::(bench, F16_ZIP_PATH, true); +} + +fn read_f32_as_f32_zips_nthreads(bench: &mut Bencher) { + bench_read_image_rgba_as::(bench, F32_ZIPS_PATH, true); +} + +fn read_f16_as_f16_zip_nthreads(bench: &mut Bencher) { + bench_read_image_rgba_as::(bench, F16_ZIP_PATH, true); +} + +fn read_f32_as_f32_zips_1thread(bench: &mut Bencher) { + bench_read_image_rgba_as::(bench, F32_ZIPS_PATH, false); +} + +fn read_f16_as_f16_zip_1thread(bench: &mut Bencher) { + bench_read_image_rgba_as::(bench, F16_ZIP_PATH, false); +} + +fn bench_read_image_rgba_as(bench: &mut Bencher, path: &str, parallel: bool) { + let mut file = fs::read(path).unwrap(); bencher::black_box(&mut file); bench.iter(||{ - let image = exr::prelude::read() - .no_deep_data().largest_resolution_level() - .rgba_channels(PixelVec::<(f16,f16,f16,f16)>::constructor, PixelVec::set_pixel) - .all_layers().all_attributes() - .non_parallel() - .from_buffered(Cursor::new(file.as_slice())).unwrap(); - + let image = read_file_from_memory_as::(file.as_slice(), parallel); bencher::black_box(image); }) } +fn read_file_from_memory_as(file: &[u8], parallel: bool) -> RgbaImage> + where T: FromNativeSample +{ + let read = exr::prelude::read() + .no_deep_data().largest_resolution_level() + .rgba_channels(PixelVec::<(T, T, T, T)>::constructor, PixelVec::set_pixel) + .first_valid_layer().all_attributes(); + + let read = if parallel { read } else { read.non_parallel() }; + read.from_buffered(Cursor::new(file)).unwrap() +} + benchmark_group!(pixel_format_conversion, - read_image_rgba_f32_to_f32, - read_image_rgba_f32_to_u32, - read_image_rgba_f32_to_f16, + read_f32_as_f32_uncompressed_1thread, + read_f32_as_u32_uncompressed_1thread, + read_f32_as_f16_uncompressed_1thread, + read_f32_as_f16_zips_1thread, + read_f32_as_f16_zips_nthreads, + read_f32_as_f32_zips_nthreads, + read_f32_as_f32_zips_1thread, + + read_f16_as_f16_uncompressed_1thread, + read_f16_as_u32_uncompressed_1thread, + read_f16_as_f32_uncompressed_1thread, + read_f16_as_f32_zip_1thread, + read_f16_as_f32_zip_nthreads, + read_f16_as_f16_zip_nthreads, + read_f16_as_f16_zip_1thread, ); benchmark_main!(pixel_format_conversion); \ No newline at end of file diff --git a/src/block/samples.rs b/src/block/samples.rs index 90485fd2..4352b111 100644 --- a/src/block/samples.rs +++ b/src/block/samples.rs @@ -1,6 +1,7 @@ //! Extract pixel samples from a block of pixel bytes. use crate::prelude::*; +use half::prelude::HalfFloatSliceExt; /// A single red, green, blue, or alpha value. @@ -112,6 +113,7 @@ impl From for u32 { #[inline] fn from(s: Sample) -> Self { s.to_u32() } /// Create an arbitrary sample type from one of the defined sample types. /// Should be compiled to a no-op where the file contains the predicted sample type. +/// The slice functions should be optimized into a `memcpy` where there is no conversion needed. pub trait FromNativeSample: Sized + Copy + Default + 'static { /// Create this sample from a f16, trying to represent the same numerical value @@ -122,31 +124,85 @@ pub trait FromNativeSample: Sized + Copy + Default + 'static { /// Create this sample from a u32, trying to represent the same numerical value fn from_u32(value: u32) -> Self; + + /// Convert all values from the slice into this type. + /// This function exists to allow the compiler to perform a vectorization optimization. + /// Note that this default implementation will **not** be vectorized by the compiler automatically. + /// For maximum performance you will need to override this function and implement it via + /// an explicit batched conversion such as [`convert_to_f32_slice`](https://docs.rs/half/2.3.1/half/slice/trait.HalfFloatSliceExt.html#tymethod.convert_to_f32_slice) + #[inline] + fn from_f16s(from: &[f16], to: &mut [Self]) { + assert_eq!(from.len(), to.len(), "slices must have the same length"); + for (from, to) in from.iter().zip(to.iter_mut()) { + *to = Self::from_f16(*from); + } + } + + /// Convert all values from the slice into this type. + /// This function exists to allow the compiler to perform a vectorization optimization. + /// Note that this default implementation will be vectorized by the compiler automatically. + #[inline] + fn from_f32s(from: &[f32], to: &mut [Self]) { + assert_eq!(from.len(), to.len(), "slices must have the same length"); + for (from, to) in from.iter().zip(to.iter_mut()) { + *to = Self::from_f32(*from); + } + } + + /// Convert all values from the slice into this type. + /// This function exists to allow the compiler to perform a vectorization optimization. + /// Note that this default implementation will be vectorized by the compiler automatically, + /// provided that the CPU supports the necessary conversion instructions. + /// For example, x86_64 lacks the instructions to convert `u32` to floats, + /// so this will inevitably be slow on x86_64. + #[inline] + fn from_u32s(from: &[u32], to: &mut [Self]) { + assert_eq!(from.len(), to.len(), "slices must have the same length"); + for (from, to) in from.iter().zip(to.iter_mut()) { + *to = Self::from_u32(*from); + } + } } // TODO haven't i implemented this exact behaviour already somewhere else in this library...?? impl FromNativeSample for f32 { - fn from_f16(value: f16) -> Self { value.to_f32() } - fn from_f32(value: f32) -> Self { value } // this branch means that we never have to match every single sample if the file format matches the expected output - fn from_u32(value: u32) -> Self { value as f32 } + #[inline] fn from_f16(value: f16) -> Self { value.to_f32() } + #[inline] fn from_f32(value: f32) -> Self { value } + #[inline] fn from_u32(value: u32) -> Self { value as f32 } + + // f16 is a custom type + // so the compiler can not automatically vectorize the conversion + // that's why we need to specialize this function + #[inline] + fn from_f16s(from: &[f16], to: &mut [Self]) { + from.convert_to_f32_slice(to); + } } impl FromNativeSample for u32 { - fn from_f16(value: f16) -> Self { value.to_f32() as u32 } - fn from_f32(value: f32) -> Self { value as u32 } - fn from_u32(value: u32) -> Self { value } + #[inline] fn from_f16(value: f16) -> Self { value.to_f32() as u32 } + #[inline] fn from_f32(value: f32) -> Self { value as u32 } + #[inline] fn from_u32(value: u32) -> Self { value } } impl FromNativeSample for f16 { - fn from_f16(value: f16) -> Self { value } - fn from_f32(value: f32) -> Self { f16::from_f32(value) } - fn from_u32(value: u32) -> Self { f16::from_f32(value as f32) } + #[inline] fn from_f16(value: f16) -> Self { value } + #[inline] fn from_f32(value: f32) -> Self { f16::from_f32(value) } + #[inline] fn from_u32(value: u32) -> Self { f16::from_f32(value as f32) } + + // f16 is a custom type + // so the compiler can not automatically vectorize the conversion + // that's why we need to specialize this function + #[inline] + fn from_f32s(from: &[f32], to: &mut [Self]) { + to.convert_from_f32_slice(from) + } } impl FromNativeSample for Sample { - fn from_f16(value: f16) -> Self { Self::from(value) } - fn from_f32(value: f32) -> Self { Self::from(value) } - fn from_u32(value: u32) -> Self { Self::from(value) } + #[inline] fn from_f16(value: f16) -> Self { Self::from(value) } + #[inline] fn from_f32(value: f32) -> Self { Self::from(value) } + #[inline] fn from_u32(value: u32) -> Self { Self::from(value) } } diff --git a/src/image/read/specific_channels.rs b/src/image/read/specific_channels.rs index cc7f1abc..375691c4 100644 --- a/src/image/read/specific_channels.rs +++ b/src/image/read/specific_channels.rs @@ -12,6 +12,7 @@ use crate::image::read::layers::{ChannelsReader, ReadChannels}; use crate::block::chunk::TileCoordinates; use std::marker::PhantomData; +use crate::io::Read; /// Can be attached one more channel reader. @@ -279,30 +280,121 @@ pub struct OptionalSampleReader { impl SampleReader { fn read_own_samples<'s, FullPixel>( &self, bytes: &'s[u8], pixels: &mut [FullPixel], - get_pixel: impl Fn(&mut FullPixel) -> &mut Sample + get_sample: impl Fn(&mut FullPixel) -> &mut Sample ){ let start_index = pixels.len() * self.channel_byte_offset; let byte_count = pixels.len() * self.channel.sample_type.bytes_per_sample(); - let mut own_bytes_reader = &bytes[start_index .. start_index + byte_count]; // TODO check block size somewhere + let mut own_bytes_reader = &mut &bytes[start_index .. start_index + byte_count]; // TODO check block size somewhere + let mut samples_out = pixels.iter_mut().map(|pixel| get_sample(pixel)); - let error_msg = "error when reading from in-memory slice"; - - // match outside the loop to avoid matching on every single sample + // match the type once for the whole line, not on every single sample match self.channel.sample_type { - SampleType::F16 => for pixel in pixels.iter_mut() { - *get_pixel(pixel) = Sample::from_f16(f16::read(&mut own_bytes_reader).expect(error_msg)); - }, + SampleType::F16 => read_and_convert_all_samples_batched( + &mut own_bytes_reader, &mut samples_out, + Sample::from_f16s + ), + + SampleType::F32 => read_and_convert_all_samples_batched( + &mut own_bytes_reader, &mut samples_out, + Sample::from_f32s + ), + + SampleType::U32 => read_and_convert_all_samples_batched( + &mut own_bytes_reader, &mut samples_out, + Sample::from_u32s + ), + } + + debug_assert!(samples_out.next().is_none(), "not all samples have been converted"); + debug_assert!(own_bytes_reader.is_empty(), "bytes left after reading all samples"); + } +} - SampleType::F32 => for pixel in pixels.iter_mut() { - *get_pixel(pixel) = Sample::from_f32(f32::read(&mut own_bytes_reader).expect(error_msg)); - }, - SampleType::U32 => for pixel in pixels.iter_mut() { - *get_pixel(pixel) = Sample::from_u32(u32::read(&mut own_bytes_reader).expect(error_msg)); - }, +/// Does the same as `convert_batch(in_bytes.chunks().map(From::from_bytes))`, but vectorized. +/// Reads the samples for one line, using the sample type specified in the file, +/// and then converts those to the desired sample types. +/// Uses batches to allow vectorization, converting multiple values with one instruction. +fn read_and_convert_all_samples_batched<'t, From, To>( + mut in_bytes: impl Read, + out_samples: &mut impl ExactSizeIterator, + convert_batch: fn(&[From], &mut [To]) +) where From: Data + Default + Copy, To: 't + Default + Copy +{ + // this is not a global! why is this warning triggered? + #[allow(non_upper_case_globals)] + const batch_size: usize = 16; + + let total_sample_count = out_samples.len(); + let batch_count = total_sample_count / batch_size; + let remaining_samples_count = total_sample_count % batch_size; + + let len_error_msg = "sample count was miscalculated"; + let byte_error_msg = "error when reading from in-memory slice"; + + // write samples from a given slice to the output iterator. should be inlined. + let output_n_samples = &mut move |samples: &[To]| { + for converted_sample in samples { + *out_samples.next().expect(len_error_msg) = *converted_sample; } + }; + + // read samples from the byte source into a given slice. should be inlined. + // todo: use #[inline] when available + // error[E0658]: attributes on expressions are experimental, + // see issue #15701 for more information + let read_n_samples = &mut move |samples: &mut [From]| { + Data::read_slice(&mut in_bytes, samples).expect(byte_error_msg); + }; + + // temporary arrays with fixed size, operations should be vectorized within these arrays + let mut source_samples_batch: [From; batch_size] = Default::default(); + let mut desired_samples_batch: [To; batch_size] = Default::default(); + + // first convert all whole batches, size statically known to be 16 element arrays + for _ in 0 .. batch_count { + read_n_samples(&mut source_samples_batch); + convert_batch(source_samples_batch.as_slice(), desired_samples_batch.as_mut_slice()); + output_n_samples(&desired_samples_batch); + } - debug_assert!(own_bytes_reader.is_empty(), "bytes left after reading all samples"); + // then convert a partial remaining batch, size known only at runtime + if remaining_samples_count != 0 { + let source_samples_batch = &mut source_samples_batch[..remaining_samples_count]; + let desired_samples_batch = &mut desired_samples_batch[..remaining_samples_count]; + + read_n_samples(source_samples_batch); + convert_batch(source_samples_batch, desired_samples_batch); + output_n_samples(desired_samples_batch); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn equals_naive_f32(){ + for total_array_size in [3, 7, 30, 41, 120, 10_423] { + let input_f32s = (0..total_array_size).map(|_| rand::random::()).collect::>(); + let in_f32s_bytes = input_f32s.iter().cloned().flat_map(f32::to_le_bytes).collect::>(); + + let mut out_f16_samples_batched = vec![ + f16::from_f32(rand::random::()); + total_array_size + ]; + + read_and_convert_all_samples_batched( + &mut in_f32s_bytes.as_slice(), + &mut out_f16_samples_batched.iter_mut(), + f16::from_f32s + ); + + let out_f16_samples_naive = input_f32s.iter() + .cloned().map(f16::from_f32); + + assert!(out_f16_samples_naive.eq(out_f16_samples_batched)); + } } } diff --git a/src/math.rs b/src/math.rs index 50b4c853..9f21bf1a 100644 --- a/src/math.rs +++ b/src/math.rs @@ -194,9 +194,15 @@ impl RoundingMode { } } + /// Only works for positive numbers. pub(crate) fn divide(self, dividend: T, divisor: T) -> T - where T: Copy + Add + Sub + Div + From + where T: Copy + Add + Sub + Div + From + std::cmp::PartialOrd { + assert!( + dividend >= T::from(0) && divisor >= T::from(1), + "division with rounding up only works for positive numbers" + ); + match self { RoundingMode::Up => (dividend + divisor - T::from(1_u8)) / divisor, // only works for positive numbers RoundingMode::Down => dividend / divisor, diff --git a/tests/images/valid/custom/crowskull/crow_uncompressed_half.exr b/tests/images/valid/custom/crowskull/crow_uncompressed_half.exr new file mode 100644 index 00000000..3b9257f2 Binary files /dev/null and b/tests/images/valid/custom/crowskull/crow_uncompressed_half.exr differ