Commit 3e0f6cd

Merge pull request #191 from johannesvollmer/f16_batch_conversion
batched f16 conversion
2 parents 0cb958c + 85a311d, commit 3e0f6cd

File tree: 7 files changed (+279, -76 lines)

.github/workflows/rust.yml (+8, -9)

@@ -47,13 +47,13 @@ jobs:
      steps:
      - uses: actions/checkout@v2

-     - name: Install or use cached foresterre/cargo-msrv
-       uses: baptiste0928/cargo-install@v1
-       with:
-         crate: cargo-msrv
+     - name: Install foresterre/cargo-msrv without cache (takes longer, but caching produces unexpected behaviour)
+       run: cargo install cargo-msrv

-     - name: Verify the Rustc version declared in `cargo.toml`
+     - name: Verify the Rustc version declared in `cargo.toml` without cache (takes longer, but caching produces unexpected behaviour)
        run: |
+         rm -f Cargo.lock
+         cargo update
          cargo-msrv verify

  # github actions does not support big endian systems directly, but it does support QEMU.
@@ -82,13 +82,11 @@ jobs:
        run: sudo systemctl start docker

      - name: Cross-Compile project to mips-unknown-linux-gnu
-       run: |
-         cross build --target=mips-unknown-linux-gnu --verbose
+       run: cross build --target=mips-unknown-linux-gnu --verbose

      # https://github.com/cross-rs/cross#supported-targets
      - name: Cross-Run Tests in mips-unknown-linux-gnu using Qemu
-       run: |
-         cross test --target mips-unknown-linux-gnu --verbose
+       run: cross test --target mips-unknown-linux-gnu --verbose

    wasm32:
      runs-on: ubuntu-latest
@@ -109,3 +107,4 @@ jobs:

      - name: Run tests without default features
        run: cargo test --verbose --no-default-features
+
Cargo.toml (+1, -1)

@@ -28,7 +28,7 @@ proc-macro = false

  [dependencies]
  lebe = "^0.5.2"          # generic binary serialization
- half = ">=2.1.0, <2.3"   # 16 bit float pixel data type
+ half = ">=2.1.0, <2.3"   # 16 bit float pixel data type
  bit_field = "^0.10.1"    # exr file version bit flags
  miniz_oxide = "^0.7.1"   # zip compression for pxr24
  smallvec = "^1.7.0"      # make cache-friendly allocations TODO profile if smallvec is really an improvement!

benches/pixel_format_conversion.rs (+88, -38)

@@ -8,62 +8,112 @@ use bencher::Bencher;
  use std::fs;
  use std::io::Cursor;
  use exr::image::pixel_vec::PixelVec;
+ use exr::io::Data;
+ use exr::block::samples::FromNativeSample;
+
+ const F32_ZIPS_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_zips.exr";
+ const F32_UNCOMPRESSED_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_uncompressed.exr";
+ const F16_UNCOMPRESSED_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_uncompressed_half.exr";
+ const F16_ZIP_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_zip_half.exr";

  /// Read an image from an in-memory buffer into its native f32 format
- fn read_image_rgba_f32_to_f32(bench: &mut Bencher) {
-     let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap();
-     bencher::black_box(&mut file);
+ fn read_f32_as_f32_uncompressed_1thread(bench: &mut Bencher) {
+     bench_read_image_rgba_as::<f32>(bench, F32_UNCOMPRESSED_PATH, false);
+ }

-     bench.iter(||{
-         let image = exr::prelude::read()
-             .no_deep_data().largest_resolution_level()
-             .rgba_channels(PixelVec::<(f32,f32,f32,f32)>::constructor, PixelVec::set_pixel)
-             .all_layers().all_attributes()
-             .non_parallel()
-             .from_buffered(Cursor::new(file.as_slice())).unwrap();
+ /// Read image and convert the samples to u32 (from native f32)
+ fn read_f32_as_u32_uncompressed_1thread(bench: &mut Bencher) {
+     bench_read_image_rgba_as::<u32>(bench, F32_UNCOMPRESSED_PATH, false);
+ }

-         bencher::black_box(image);
-     })
+ /// f16 is not natively supported by CPUs, which introduces unique performance pitfalls
+ fn read_f32_as_f16_uncompressed_1thread(bench: &mut Bencher) {
+     bench_read_image_rgba_as::<f16>(bench, F32_UNCOMPRESSED_PATH, false);
  }

- /// Read image and convert the samples to u32 (from native f32)
- fn read_image_rgba_f32_to_u32(bench: &mut Bencher) {
-     let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap();
-     bencher::black_box(&mut file);
+ fn read_f16_as_f16_uncompressed_1thread(bench: &mut Bencher) {
+     bench_read_image_rgba_as::<f16>(bench, F16_UNCOMPRESSED_PATH, false);
+ }

-     bench.iter(||{
-         let image = exr::prelude::read()
-             .no_deep_data().largest_resolution_level()
-             .rgba_channels(PixelVec::<(u32,u32,u32,u32)>::constructor, PixelVec::set_pixel)
-             .all_layers().all_attributes()
-             .non_parallel()
-             .from_buffered(Cursor::new(file.as_slice())).unwrap();
+ fn read_f16_as_f32_uncompressed_1thread(bench: &mut Bencher) {
+     bench_read_image_rgba_as::<f32>(bench, F16_UNCOMPRESSED_PATH, false);
+ }

-         bencher::black_box(image);
-     })
+ fn read_f16_as_u32_uncompressed_1thread(bench: &mut Bencher) {
+     bench_read_image_rgba_as::<u32>(bench, F16_UNCOMPRESSED_PATH, false);
  }

- /// f16 is not natively supported by CPUs, which introduces unique performance pitfalls
- fn read_image_rgba_f32_to_f16(bench: &mut Bencher) {
-     let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap();
+
+ fn read_f32_as_f16_zips_1thread(bench: &mut Bencher) {
+     bench_read_image_rgba_as::<f16>(bench, F32_ZIPS_PATH, false);
+ }
+
+ fn read_f16_as_f32_zip_1thread(bench: &mut Bencher) {
+     bench_read_image_rgba_as::<f32>(bench, F16_ZIP_PATH, false);
+ }
+
+ fn read_f32_as_f16_zips_nthreads(bench: &mut Bencher) {
+     bench_read_image_rgba_as::<f16>(bench, F32_ZIPS_PATH, true);
+ }
+
+ fn read_f16_as_f32_zip_nthreads(bench: &mut Bencher) {
+     bench_read_image_rgba_as::<f32>(bench, F16_ZIP_PATH, true);
+ }
+
+ fn read_f32_as_f32_zips_nthreads(bench: &mut Bencher) {
+     bench_read_image_rgba_as::<f32>(bench, F32_ZIPS_PATH, true);
+ }
+
+ fn read_f16_as_f16_zip_nthreads(bench: &mut Bencher) {
+     bench_read_image_rgba_as::<f16>(bench, F16_ZIP_PATH, true);
+ }
+
+ fn read_f32_as_f32_zips_1thread(bench: &mut Bencher) {
+     bench_read_image_rgba_as::<f32>(bench, F32_ZIPS_PATH, false);
+ }
+
+ fn read_f16_as_f16_zip_1thread(bench: &mut Bencher) {
+     bench_read_image_rgba_as::<f16>(bench, F16_ZIP_PATH, false);
+ }
+
+ fn bench_read_image_rgba_as<T: FromNativeSample>(bench: &mut Bencher, path: &str, parallel: bool) {
+     let mut file = fs::read(path).unwrap();
      bencher::black_box(&mut file);

      bench.iter(||{
-         let image = exr::prelude::read()
-             .no_deep_data().largest_resolution_level()
-             .rgba_channels(PixelVec::<(f16,f16,f16,f16)>::constructor, PixelVec::set_pixel)
-             .all_layers().all_attributes()
-             .non_parallel()
-             .from_buffered(Cursor::new(file.as_slice())).unwrap();
-
+         let image = read_file_from_memory_as::<T>(file.as_slice(), parallel);
          bencher::black_box(image);
      })
  }

+ fn read_file_from_memory_as<T>(file: &[u8], parallel: bool) -> RgbaImage<PixelVec<(T, T, T, T)>>
+     where T: FromNativeSample
+ {
+     let read = exr::prelude::read()
+         .no_deep_data().largest_resolution_level()
+         .rgba_channels(PixelVec::<(T, T, T, T)>::constructor, PixelVec::set_pixel)
+         .first_valid_layer().all_attributes();
+
+     let read = if parallel { read } else { read.non_parallel() };
+     read.from_buffered(Cursor::new(file)).unwrap()
+ }
+
  benchmark_group!(pixel_format_conversion,
-     read_image_rgba_f32_to_f32,
-     read_image_rgba_f32_to_u32,
-     read_image_rgba_f32_to_f16,
+     read_f32_as_f32_uncompressed_1thread,
+     read_f32_as_u32_uncompressed_1thread,
+     read_f32_as_f16_uncompressed_1thread,
+     read_f32_as_f16_zips_1thread,
+     read_f32_as_f16_zips_nthreads,
+     read_f32_as_f32_zips_nthreads,
+     read_f32_as_f32_zips_1thread,
+
+     read_f16_as_f16_uncompressed_1thread,
+     read_f16_as_u32_uncompressed_1thread,
+     read_f16_as_f32_uncompressed_1thread,
+     read_f16_as_f32_zip_1thread,
+     read_f16_as_f32_zip_nthreads,
+     read_f16_as_f16_zip_nthreads,
+     read_f16_as_f16_zip_1thread,
  );

  benchmark_main!(pixel_format_conversion);
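
All of the benchmark functions above funnel into the generic `bench_read_image_rgba_as::<T>` helper, whose `T: FromNativeSample` bound is what the next file extends with batched slice conversions. As a rough, hypothetical sketch (not part of this commit), those batched entry points could be exercised directly like this, assuming `FromNativeSample` is reachable at `exr::block::samples::FromNativeSample` (the same path the bench file imports) and that `f16` is re-exported by `exr::prelude`:

```rust
// Hypothetical illustration, not from this commit: convert a whole buffer of
// native f32 samples into any target sample type with one batched call.
use exr::block::samples::FromNativeSample;
use exr::prelude::f16; // assumed re-export of half::f16 from exr's prelude

fn convert_samples<T: FromNativeSample>(native: &[f32]) -> Vec<T> {
    // allocate the output once, then let the batched conversion fill it
    let mut converted = vec![T::default(); native.len()];
    T::from_f32s(native, &mut converted);
    converted
}

fn main() {
    let native = [0.25_f32, 0.5, 1.0, 2.0];
    let as_f16: Vec<f16> = convert_samples(&native); // specialized, batched f16 path
    let as_u32: Vec<u32> = convert_samples(&native); // element-wise default loop
    println!("{:?} {:?}", as_f16, as_u32);
}
```

Calling `from_f32s` once per buffer gives the specialized `f16` implementation a chance to convert the whole slice in one batched call, while the `u32` path falls back to the default per-element loop.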

src/block/samples.rs (+68, -12)

@@ -1,6 +1,7 @@
  //! Extract pixel samples from a block of pixel bytes.

  use crate::prelude::*;
+ use half::prelude::HalfFloatSliceExt;


  /// A single red, green, blue, or alpha value.
@@ -112,6 +113,7 @@ impl From<Sample> for u32 { #[inline] fn from(s: Sample) -> Self { s.to_u32() }

  /// Create an arbitrary sample type from one of the defined sample types.
  /// Should be compiled to a no-op where the file contains the predicted sample type.
+ /// The slice functions should be optimized into a `memcpy` where there is no conversion needed.
  pub trait FromNativeSample: Sized + Copy + Default + 'static {

      /// Create this sample from a f16, trying to represent the same numerical value
@@ -122,31 +124,85 @@ pub trait FromNativeSample: Sized + Copy + Default + 'static {

      /// Create this sample from a u32, trying to represent the same numerical value
      fn from_u32(value: u32) -> Self;
+
+     /// Convert all values from the slice into this type.
+     /// This function exists to allow the compiler to perform a vectorization optimization.
+     /// Note that this default implementation will **not** be vectorized by the compiler automatically.
+     /// For maximum performance you will need to override this function and implement it via
+     /// an explicit batched conversion such as [`convert_to_f32_slice`](https://docs.rs/half/2.3.1/half/slice/trait.HalfFloatSliceExt.html#tymethod.convert_to_f32_slice)
+     #[inline]
+     fn from_f16s(from: &[f16], to: &mut [Self]) {
+         assert_eq!(from.len(), to.len(), "slices must have the same length");
+         for (from, to) in from.iter().zip(to.iter_mut()) {
+             *to = Self::from_f16(*from);
+         }
+     }
+
+     /// Convert all values from the slice into this type.
+     /// This function exists to allow the compiler to perform a vectorization optimization.
+     /// Note that this default implementation will be vectorized by the compiler automatically.
+     #[inline]
+     fn from_f32s(from: &[f32], to: &mut [Self]) {
+         assert_eq!(from.len(), to.len(), "slices must have the same length");
+         for (from, to) in from.iter().zip(to.iter_mut()) {
+             *to = Self::from_f32(*from);
+         }
+     }
+
+     /// Convert all values from the slice into this type.
+     /// This function exists to allow the compiler to perform a vectorization optimization.
+     /// Note that this default implementation will be vectorized by the compiler automatically,
+     /// provided that the CPU supports the necessary conversion instructions.
+     /// For example, x86_64 lacks the instructions to convert `u32` to floats,
+     /// so this will inevitably be slow on x86_64.
+     #[inline]
+     fn from_u32s(from: &[u32], to: &mut [Self]) {
+         assert_eq!(from.len(), to.len(), "slices must have the same length");
+         for (from, to) in from.iter().zip(to.iter_mut()) {
+             *to = Self::from_u32(*from);
+         }
+     }
  }

  // TODO haven't i implemented this exact behaviour already somewhere else in this library...??
  impl FromNativeSample for f32 {
-     fn from_f16(value: f16) -> Self { value.to_f32() }
-     fn from_f32(value: f32) -> Self { value } // this branch means that we never have to match every single sample if the file format matches the expected output
-     fn from_u32(value: u32) -> Self { value as f32 }
+     #[inline] fn from_f16(value: f16) -> Self { value.to_f32() }
+     #[inline] fn from_f32(value: f32) -> Self { value }
+     #[inline] fn from_u32(value: u32) -> Self { value as f32 }
+
+     // f16 is a custom type
+     // so the compiler can not automatically vectorize the conversion
+     // that's why we need to specialize this function
+     #[inline]
+     fn from_f16s(from: &[f16], to: &mut [Self]) {
+         from.convert_to_f32_slice(to);
+     }
  }

  impl FromNativeSample for u32 {
-     fn from_f16(value: f16) -> Self { value.to_f32() as u32 }
-     fn from_f32(value: f32) -> Self { value as u32 }
-     fn from_u32(value: u32) -> Self { value }
+     #[inline] fn from_f16(value: f16) -> Self { value.to_f32() as u32 }
+     #[inline] fn from_f32(value: f32) -> Self { value as u32 }
+     #[inline] fn from_u32(value: u32) -> Self { value }
  }

  impl FromNativeSample for f16 {
-     fn from_f16(value: f16) -> Self { value }
-     fn from_f32(value: f32) -> Self { f16::from_f32(value) }
-     fn from_u32(value: u32) -> Self { f16::from_f32(value as f32) }
+     #[inline] fn from_f16(value: f16) -> Self { value }
+     #[inline] fn from_f32(value: f32) -> Self { f16::from_f32(value) }
+     #[inline] fn from_u32(value: u32) -> Self { f16::from_f32(value as f32) }
+
+     // f16 is a custom type
+     // so the compiler can not automatically vectorize the conversion
+     // that's why we need to specialize this function
+     #[inline]
+     fn from_f32s(from: &[f32], to: &mut [Self]) {
+         to.convert_from_f32_slice(from)
+     }
  }

  impl FromNativeSample for Sample {
-     fn from_f16(value: f16) -> Self { Self::from(value) }
-     fn from_f32(value: f32) -> Self { Self::from(value) }
-     fn from_u32(value: u32) -> Self { Self::from(value) }
+     #[inline] fn from_f16(value: f16) -> Self { Self::from(value) }
+     #[inline] fn from_f32(value: f32) -> Self { Self::from(value) }
+     #[inline] fn from_u32(value: u32) -> Self { Self::from(value) }
  }

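The specialized `from_f16s`/`from_f32s` implementations above delegate to the `half` crate's `HalfFloatSliceExt` slice conversions instead of converting sample by sample. A minimal standalone sketch (my illustration, not code from this commit) of that batched call next to the scalar fallback, assuming `half` >= 2.1 as pinned in Cargo.toml:

```rust
// Standalone sketch (not part of this commit) using the `half` crate directly,
// matching the ">=2.1.0, <2.3" requirement in Cargo.toml above.
use half::f16;
use half::prelude::HalfFloatSliceExt;

fn main() {
    let halfs: Vec<f16> = (0..8).map(|i| f16::from_f32(i as f32 * 0.5)).collect();
    let mut floats = vec![0.0_f32; halfs.len()];

    // Batched conversion: one call over the whole slice, which `half` can back
    // with SIMD or hardware f16 support where available.
    halfs.convert_to_f32_slice(&mut floats);

    // Scalar fallback for comparison: one conversion per element.
    let scalar: Vec<f32> = halfs.iter().map(|h| h.to_f32()).collect();

    assert_eq!(floats, scalar);
}
```

Because `f16` is a library type rather than a language primitive, the per-element loop cannot be auto-vectorized by the compiler, which is exactly why the trait gained overridable batched methods in this commit.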