diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 1ff25db7..542b4cf4 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -47,13 +47,13 @@ jobs:
     steps:
       - uses: actions/checkout@v2
 
-      - name: Install or use cached foresterre/cargo-msrv
-        uses: baptiste0928/cargo-install@v1
-        with:
-          crate: cargo-msrv
+      - name: Install foresterre/cargo-msrv without cache (takes longer, but caching produces unexpected behaviour)
+        run: cargo install cargo-msrv
 
-      - name: Verify the Rustc version declared in `cargo.toml`
+      - name: Verify the Rustc version declared in `cargo.toml` without cache (takes longer, but caching produces unexpected behaviour)
         run: |
+          rm -f Cargo.lock
+          cargo update
           cargo-msrv verify
 
   # github actions does not support big endian systems directly, but it does support QEMU.
@@ -82,13 +82,11 @@ jobs:
         run: sudo systemctl start docker
 
       - name: Cross-Compile project to mips-unknown-linux-gnu
-        run: |
-          cross build --target=mips-unknown-linux-gnu --verbose
+        run: cross build --target=mips-unknown-linux-gnu --verbose
 
       # https://github.com/cross-rs/cross#supported-targets
       - name: Cross-Run Tests in mips-unknown-linux-gnu using Qemu
-        run: |
-          cross test --target mips-unknown-linux-gnu --verbose
+        run: cross test --target mips-unknown-linux-gnu --verbose
 
   wasm32:
     runs-on: ubuntu-latest
@@ -109,3 +107,4 @@ jobs:
       
     - name: Run tests without default features
       run: cargo test --verbose --no-default-features
+
diff --git a/Cargo.toml b/Cargo.toml
index 4d2adec4..51103f3f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -28,7 +28,7 @@ proc-macro = false
 
 [dependencies]
 lebe = "^0.5.2"                # generic binary serialization
-half = ">=2.1.0, <2.3"         # 16 bit float pixel data type
+half = ">=2.1.0, <2.3"               # 16 bit float pixel data type
 bit_field = "^0.10.1"          # exr file version bit flags
 miniz_oxide = "^0.7.1"         # zip compression for pxr24
 smallvec = "^1.7.0"            # make cache-friendly allocations        TODO profile if smallvec is really an improvement!
diff --git a/benches/pixel_format_conversion.rs b/benches/pixel_format_conversion.rs
index 705963cc..3d1ba930 100644
--- a/benches/pixel_format_conversion.rs
+++ b/benches/pixel_format_conversion.rs
@@ -8,62 +8,112 @@ use bencher::Bencher;
 use std::fs;
 use std::io::Cursor;
 use exr::image::pixel_vec::PixelVec;
+use exr::io::Data;
+use exr::block::samples::FromNativeSample;
+
+const F32_ZIPS_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_zips.exr";
+const F32_UNCOMPRESSED_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_uncompressed.exr";
+const F16_UNCOMPRESSED_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_uncompressed_half.exr";
+const F16_ZIP_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_zip_half.exr";
 
 /// Read an image from an in-memory buffer into its native f32 format
-fn read_image_rgba_f32_to_f32(bench: &mut Bencher) {
-    let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap();
-    bencher::black_box(&mut file);
+fn read_f32_as_f32_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F32_UNCOMPRESSED_PATH, false);
+}
 
-    bench.iter(||{
-        let image = exr::prelude::read()
-            .no_deep_data().largest_resolution_level()
-            .rgba_channels(PixelVec::<(f32,f32,f32,f32)>::constructor, PixelVec::set_pixel)
-            .all_layers().all_attributes()
-            .non_parallel()
-            .from_buffered(Cursor::new(file.as_slice())).unwrap();
+/// Read image and convert the samples to u32 (from native f32)
+fn read_f32_as_u32_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<u32>(bench, F32_UNCOMPRESSED_PATH, false);
+}
 
-        bencher::black_box(image);
-    })
+/// f16 is not natively supported by CPUs, which introduces unique performance pitfalls
+fn read_f32_as_f16_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F32_UNCOMPRESSED_PATH, false);
 }
 
-/// Read image and convert the samples to u32 (from native f32)
-fn read_image_rgba_f32_to_u32(bench: &mut Bencher) {
-    let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap();
-    bencher::black_box(&mut file);
+fn read_f16_as_f16_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F16_UNCOMPRESSED_PATH, false);
+}
 
-    bench.iter(||{
-        let image = exr::prelude::read()
-            .no_deep_data().largest_resolution_level()
-            .rgba_channels(PixelVec::<(u32,u32,u32,u32)>::constructor, PixelVec::set_pixel)
-            .all_layers().all_attributes()
-            .non_parallel()
-            .from_buffered(Cursor::new(file.as_slice())).unwrap();
+fn read_f16_as_f32_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F16_UNCOMPRESSED_PATH, false);
+}
 
-        bencher::black_box(image);
-    })
+fn read_f16_as_u32_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<u32>(bench, F16_UNCOMPRESSED_PATH, false);
 }
 
-/// f16 is not natively supported by CPUs, which introduces unique performance pitfalls
-fn read_image_rgba_f32_to_f16(bench: &mut Bencher) {
-    let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap();
+
+fn read_f32_as_f16_zips_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F32_ZIPS_PATH, false);
+}
+
+fn read_f16_as_f32_zip_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F16_ZIP_PATH, false);
+}
+
+fn read_f32_as_f16_zips_nthreads(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F32_ZIPS_PATH, true);
+}
+
+fn read_f16_as_f32_zip_nthreads(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F16_ZIP_PATH, true);
+}
+
+fn read_f32_as_f32_zips_nthreads(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F32_ZIPS_PATH, true);
+}
+
+fn read_f16_as_f16_zip_nthreads(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F16_ZIP_PATH, true);
+}
+
+fn read_f32_as_f32_zips_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F32_ZIPS_PATH, false);
+}
+
+fn read_f16_as_f16_zip_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F16_ZIP_PATH, false);
+}
+
+fn bench_read_image_rgba_as<T: FromNativeSample>(bench: &mut Bencher, path: &str, parallel: bool) {
+    let mut file = fs::read(path).unwrap(); // load the file once, outside of the measured closure
     bencher::black_box(&mut file);
 
     bench.iter(||{
-        let image = exr::prelude::read()
-            .no_deep_data().largest_resolution_level()
-            .rgba_channels(PixelVec::<(f16,f16,f16,f16)>::constructor, PixelVec::set_pixel)
-            .all_layers().all_attributes()
-            .non_parallel()
-            .from_buffered(Cursor::new(file.as_slice())).unwrap();
-
+        let image = read_file_from_memory_as::<T>(file.as_slice(), parallel); // decode as the requested `T`, not always `f16`
         bencher::black_box(image);
     })
 }
 
+/// Decodes the first valid rgba layer of an in-memory exr file into `T` samples, panicking on failure.
+fn read_file_from_memory_as<T>(file: &[u8], parallel: bool) -> RgbaImage<PixelVec<(T, T, T, T)>>
+    where T: FromNativeSample
+{
+    let mut reader = exr::prelude::read()
+        .no_deep_data().largest_resolution_level()
+        .rgba_channels(PixelVec::<(T, T, T, T)>::constructor, PixelVec::set_pixel)
+        .first_valid_layer().all_attributes();
+    if !parallel { reader = reader.non_parallel(); }
+    reader.from_buffered(Cursor::new(file)).unwrap()
+}
+
 benchmark_group!(pixel_format_conversion,
-    read_image_rgba_f32_to_f32,
-    read_image_rgba_f32_to_u32,
-    read_image_rgba_f32_to_f16,
+    read_f32_as_f32_uncompressed_1thread,
+    read_f32_as_u32_uncompressed_1thread,
+    read_f32_as_f16_uncompressed_1thread,
+    read_f32_as_f16_zips_1thread,
+    read_f32_as_f16_zips_nthreads,
+    read_f32_as_f32_zips_nthreads,
+    read_f32_as_f32_zips_1thread,
+
+    read_f16_as_f16_uncompressed_1thread,
+    read_f16_as_u32_uncompressed_1thread,
+    read_f16_as_f32_uncompressed_1thread,
+    read_f16_as_f32_zip_1thread,
+    read_f16_as_f32_zip_nthreads,
+    read_f16_as_f16_zip_nthreads,
+    read_f16_as_f16_zip_1thread,
 );
 
 benchmark_main!(pixel_format_conversion);
\ No newline at end of file
diff --git a/src/block/samples.rs b/src/block/samples.rs
index 90485fd2..4352b111 100644
--- a/src/block/samples.rs
+++ b/src/block/samples.rs
@@ -1,6 +1,7 @@
 //! Extract pixel samples from a block of pixel bytes.
 
 use crate::prelude::*;
+use half::prelude::HalfFloatSliceExt;
 
 
 /// A single red, green, blue, or alpha value.
@@ -112,6 +113,7 @@ impl From<Sample> for u32 { #[inline] fn from(s: Sample) -> Self { s.to_u32() }
 
 /// Create an arbitrary sample type from one of the defined sample types.
 /// Should be compiled to a no-op where the file contains the predicted sample type.
+/// The slice functions should be optimized into a `memcpy` where there is no conversion needed.
 pub trait FromNativeSample: Sized + Copy + Default + 'static {
 
     /// Create this sample from a f16, trying to represent the same numerical value
@@ -122,31 +124,85 @@ pub trait FromNativeSample: Sized + Copy + Default + 'static {
 
     /// Create this sample from a u32, trying to represent the same numerical value
     fn from_u32(value: u32) -> Self;
+
+    /// Converts every `f16` in `from` and stores the result at the same index in `to`.
+    /// Exists as a slice-level hook so that implementors can supply a vectorized conversion;
+    /// this default converts one sample at a time and is **not** vectorized by the compiler automatically.
+    /// For maximum performance, override it with an explicit batched conversion such as
+    /// [`convert_to_f32_slice`](https://docs.rs/half/2.3.1/half/slice/trait.HalfFloatSliceExt.html#tymethod.convert_to_f32_slice).
+    /// Panics if the two slices differ in length.
+    #[inline]
+    fn from_f16s(from: &[f16], to: &mut [Self]) {
+        assert_eq!(from.len(), to.len(), "slices must have the same length");
+        from.iter().zip(to.iter_mut())
+            .for_each(|(source, target)| *target = Self::from_f16(*source));
+    }
+
+    /// Converts every `f32` in `from` and stores the result at the same index in `to`.
+    /// Exists as a slice-level hook so that the conversion can be vectorized;
+    /// the compiler is expected to vectorize this default implementation automatically.
+    /// Panics if the two slices differ in length.
+    #[inline]
+    fn from_f32s(from: &[f32], to: &mut [Self]) {
+        assert_eq!(from.len(), to.len(), "slices must have the same length");
+        from.iter().zip(to.iter_mut())
+            .for_each(|(source, target)| *target = Self::from_f32(*source));
+    }
+
+    /// Converts every `u32` in `from` and stores the result at the same index in `to`.
+    /// Exists as a slice-level hook so that the conversion can be vectorized.
+    /// The compiler is expected to vectorize this default implementation automatically,
+    /// provided that the CPU supports the necessary conversion instructions.
+    /// For example, x86_64 lacks the instructions to convert `u32` to floats,
+    /// so this will inevitably be slow on x86_64. Panics if the two slices differ in length.
+    #[inline]
+    fn from_u32s(from: &[u32], to: &mut [Self]) {
+        assert_eq!(from.len(), to.len(), "slices must have the same length");
+        from.iter().zip(to.iter_mut())
+            .for_each(|(source, target)| *target = Self::from_u32(*source));
+    }
 }
 
 // TODO haven't i implemented this exact behaviour already somewhere else in this library...??
 impl FromNativeSample for f32 {
-    fn from_f16(value: f16) -> Self { value.to_f32() }
-    fn from_f32(value: f32) -> Self { value } // this branch means that we never have to match every single sample if the file format matches the expected output
-    fn from_u32(value: u32) -> Self { value as f32 }
+    #[inline] fn from_f16(value: f16) -> Self { value.to_f32() }
+    #[inline] fn from_f32(value: f32) -> Self { value }
+    #[inline] fn from_u32(value: u32) -> Self { value as f32 }
+
+    // f16 is a custom type
+    // so the compiler can not automatically vectorize the conversion
+    // that's why we need to specialize this function
+    #[inline]
+    fn from_f16s(from: &[f16], to: &mut [Self]) {
+        from.convert_to_f32_slice(to);
+    }
 }
 
 impl FromNativeSample for u32 {
-    fn from_f16(value: f16) -> Self { value.to_f32() as u32 }
-    fn from_f32(value: f32) -> Self { value as u32 }
-    fn from_u32(value: u32) -> Self { value }
+    #[inline] fn from_f16(value: f16) -> Self { value.to_f32() as u32 }
+    #[inline] fn from_f32(value: f32) -> Self { value as u32 }
+    #[inline] fn from_u32(value: u32) -> Self { value }
 }
 
 impl FromNativeSample for f16 {
-    fn from_f16(value: f16) -> Self { value }
-    fn from_f32(value: f32) -> Self { f16::from_f32(value) }
-    fn from_u32(value: u32) -> Self { f16::from_f32(value as f32) }
+    #[inline] fn from_f16(value: f16) -> Self { value }
+    #[inline] fn from_f32(value: f32) -> Self { f16::from_f32(value) }
+    #[inline] fn from_u32(value: u32) -> Self { f16::from_f32(value as f32) }
+
+    // f16 is a custom type
+    // so the compiler can not automatically vectorize the conversion
+    // that's why we need to specialize this function
+    #[inline]
+    fn from_f32s(from: &[f32], to: &mut [Self]) {
+        to.convert_from_f32_slice(from)
+    }
 }
 
 impl FromNativeSample for Sample {
-    fn from_f16(value: f16) -> Self { Self::from(value) }
-    fn from_f32(value: f32) -> Self { Self::from(value) }
-    fn from_u32(value: u32) -> Self { Self::from(value) }
+    #[inline] fn from_f16(value: f16) -> Self { Self::from(value) }
+    #[inline] fn from_f32(value: f32) -> Self { Self::from(value) }
+    #[inline] fn from_u32(value: u32) -> Self { Self::from(value) }
 }
 
 
diff --git a/src/image/read/specific_channels.rs b/src/image/read/specific_channels.rs
index cc7f1abc..375691c4 100644
--- a/src/image/read/specific_channels.rs
+++ b/src/image/read/specific_channels.rs
@@ -12,6 +12,7 @@ use crate::image::read::layers::{ChannelsReader, ReadChannels};
 use crate::block::chunk::TileCoordinates;
 
 use std::marker::PhantomData;
+use crate::io::Read;
 
 
 /// Can be attached one more channel reader.
@@ -279,30 +280,121 @@ pub struct OptionalSampleReader<DefaultSample> {
 impl<Sample: FromNativeSample> SampleReader<Sample> {
     fn read_own_samples<'s, FullPixel>(
         &self, bytes: &'s[u8], pixels: &mut [FullPixel],
-        get_pixel: impl Fn(&mut FullPixel) -> &mut Sample
+        get_sample: impl Fn(&mut FullPixel) -> &mut Sample
     ){
         let start_index = pixels.len() * self.channel_byte_offset;
         let byte_count = pixels.len() * self.channel.sample_type.bytes_per_sample();
-        let mut own_bytes_reader = &bytes[start_index .. start_index + byte_count]; // TODO check block size somewhere
+        let mut own_bytes_reader = &mut &bytes[start_index .. start_index + byte_count]; // TODO check block size somewhere
+        let mut samples_out = pixels.iter_mut().map(|pixel| get_sample(pixel));
 
-        let error_msg = "error when reading from in-memory slice";
-
-        // match outside the loop to avoid matching on every single sample
+        // match the type once for the whole line, not on every single sample
         match self.channel.sample_type {
-            SampleType::F16 => for pixel in pixels.iter_mut() {
-                *get_pixel(pixel) = Sample::from_f16(f16::read(&mut own_bytes_reader).expect(error_msg));
-            },
+            SampleType::F16 => read_and_convert_all_samples_batched(
+                &mut own_bytes_reader, &mut samples_out,
+                Sample::from_f16s
+            ),
+
+            SampleType::F32 => read_and_convert_all_samples_batched(
+                &mut own_bytes_reader, &mut samples_out,
+                Sample::from_f32s
+            ),
+
+            SampleType::U32 => read_and_convert_all_samples_batched(
+                &mut own_bytes_reader, &mut samples_out,
+                Sample::from_u32s
+            ),
+        }
+
+        debug_assert!(samples_out.next().is_none(), "not all samples have been converted");
+        debug_assert!(own_bytes_reader.is_empty(), "bytes left after reading all samples");
+    }
+}
 
-            SampleType::F32 => for pixel in pixels.iter_mut() {
-                *get_pixel(pixel) = Sample::from_f32(f32::read(&mut own_bytes_reader).expect(error_msg));
-            },
 
-            SampleType::U32 => for pixel in pixels.iter_mut() {
-                *get_pixel(pixel) = Sample::from_u32(u32::read(&mut own_bytes_reader).expect(error_msg));
-            },
+/// Does the same as `convert_batch(in_bytes.chunks().map(From::from_bytes))`, but vectorized.
+/// Reads the samples for one line, using the sample type specified in the file,
+/// and then converts those to the desired sample types in fixed-size batches to allow vectorization.
+/// Panics if `in_bytes` cannot provide one source sample for every item of `out_samples`.
+fn read_and_convert_all_samples_batched<'t, From, To>(
+    mut in_bytes: impl Read,
+    out_samples: &mut impl ExactSizeIterator<Item=&'t mut To>,
+    convert_batch: fn(&[From], &mut [To])
+) where From: Data + Default + Copy, To: 't + Default + Copy
+{
+    // a compile-time constant, so that the temporary arrays below have a statically known length
+    // (constants are upper case even when local, otherwise `non_upper_case_globals` is triggered)
+    const BATCH_SIZE: usize = 16;
+
+    let total_sample_count = out_samples.len();
+    let batch_count = total_sample_count / BATCH_SIZE;
+    let remaining_samples_count = total_sample_count % BATCH_SIZE;
+
+    let len_error_msg = "sample count was miscalculated";
+    let byte_error_msg = "error when reading from in-memory slice";
+
+    // write samples from a given slice to the output iterator. should be inlined.
+    let output_n_samples = &mut move |samples: &[To]| {
+        for converted_sample in samples {
+            *out_samples.next().expect(len_error_msg) = *converted_sample;
         }
+    };
+
+    // read samples from the byte source into a given slice. should be inlined.
+    // todo: use #[inline] when available
+    // error[E0658]: attributes on expressions are experimental,
+    // see issue #15701 <https://github.com/rust-lang/rust/issues/15701> for more information
+    let read_n_samples = &mut move |samples: &mut [From]| {
+        Data::read_slice(&mut in_bytes, samples).expect(byte_error_msg);
+    };
+
+    // temporary arrays with fixed size, operations should be vectorized within these arrays
+    let mut source_samples_batch: [From; BATCH_SIZE] = Default::default();
+    let mut desired_samples_batch: [To; BATCH_SIZE] = Default::default();
+
+    // first convert all whole batches, size statically known to be 16 element arrays
+    for _ in 0 .. batch_count {
+        read_n_samples(&mut source_samples_batch);
+        convert_batch(source_samples_batch.as_slice(), desired_samples_batch.as_mut_slice());
+        output_n_samples(&desired_samples_batch);
+    }
 
-        debug_assert!(own_bytes_reader.is_empty(), "bytes left after reading all samples");
+    // then convert a partial remaining batch, size known only at runtime
+    if remaining_samples_count != 0 {
+        let source_samples_batch = &mut source_samples_batch[..remaining_samples_count];
+        let desired_samples_batch = &mut desired_samples_batch[..remaining_samples_count];
+
+        read_n_samples(source_samples_batch);
+        convert_batch(source_samples_batch, desired_samples_batch);
+        output_n_samples(desired_samples_batch);
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn equals_naive_f32(){
+        // cover the empty input, exact multiples of the batch size (16), and sizes with a partial batch
+        for total_array_size in [0, 3, 7, 16, 30, 32, 41, 120, 10_423] {
+            let input_f32s = (0..total_array_size).map(|_| rand::random::<f32>()).collect::<Vec<f32>>();
+            let in_f32s_bytes = input_f32s.iter().cloned().flat_map(f32::to_le_bytes).collect::<Vec<u8>>();
+
+            let mut out_f16_samples_batched = vec![
+                f16::from_f32(rand::random::<f32>());
+                total_array_size
+            ];
+
+            read_and_convert_all_samples_batched(
+                &mut in_f32s_bytes.as_slice(),
+                &mut out_f16_samples_batched.iter_mut(),
+                f16::from_f32s
+            );
+
+            let out_f16_samples_naive = input_f32s.iter().cloned().map(f16::from_f32);
+
+            assert!(out_f16_samples_naive.eq(out_f16_samples_batched));
+        }
     }
 }
 
diff --git a/src/math.rs b/src/math.rs
index 50b4c853..9f21bf1a 100644
--- a/src/math.rs
+++ b/src/math.rs
@@ -194,9 +194,15 @@ impl RoundingMode {
         }
     }
 
+    /// Only works for a non-negative dividend and a positive divisor; panics otherwise.
     pub(crate) fn divide<T>(self, dividend: T, divisor: T) -> T
-        where T: Copy + Add<Output = T> + Sub<Output = T> + Div<Output = T> + From<u8>
+        where T: Copy + Add<Output = T> + Sub<Output = T> + Div<Output = T> + From<u8> + std::cmp::PartialOrd
     {
+        assert!(
+            dividend >= T::from(0) && divisor >= T::from(1),
+            "division with rounding up only works for positive numbers"
+        );
+
         match self {
             RoundingMode::Up => (dividend + divisor - T::from(1_u8)) / divisor, // only works for positive numbers
             RoundingMode::Down => dividend / divisor,
diff --git a/tests/images/valid/custom/crowskull/crow_uncompressed_half.exr b/tests/images/valid/custom/crowskull/crow_uncompressed_half.exr
new file mode 100644
index 00000000..3b9257f2
Binary files /dev/null and b/tests/images/valid/custom/crowskull/crow_uncompressed_half.exr differ