From 862c37684d1f158fe948a61b3338840de2db07a3 Mon Sep 17 00:00:00 2001 From: Michael Schmidt Date: Sun, 14 Sep 2025 11:37:19 +0200 Subject: [PATCH 1/8] Improve SIMD usage of Oklab conversions --- src/color/oklab.rs | 136 +++++++++++++++++++++++++++++++++------------ 1 file changed, 100 insertions(+), 36 deletions(-) diff --git a/src/color/oklab.rs b/src/color/oklab.rs index 0f14d57..6a660cf 100644 --- a/src/color/oklab.rs +++ b/src/color/oklab.rs @@ -4,6 +4,24 @@ trait Operations { fn srgb_to_linear(c: f32) -> f32; fn linear_to_srgb(c: f32) -> f32; fn cbrt(x: f32) -> f32; + + fn srgb_to_linear_vec(x: Vec3A) -> Vec3A { + Vec3A::new( + Self::srgb_to_linear(x.x), + Self::srgb_to_linear(x.y), + Self::srgb_to_linear(x.z), + ) + } + fn linear_to_srgb_vec(x: Vec3A) -> Vec3A { + Vec3A::new( + Self::linear_to_srgb(x.x), + Self::linear_to_srgb(x.y), + Self::linear_to_srgb(x.z), + ) + } + fn cbrt_vec(x: Vec3A) -> Vec3A { + Vec3A::new(Self::cbrt(x.x), Self::cbrt(x.y), Self::cbrt(x.z)) + } } struct Reference; @@ -73,50 +91,96 @@ impl Operations for Fast { t *= G + F / (s + E + D / s); t } -} - -#[allow(clippy::excessive_precision)] -fn srgb_to_oklab_impl(rgb: Vec3A) -> Vec3A { - let [r, g, b] = rgb.to_array().map(O::srgb_to_linear); - let mut l = 0.4122214708 * r + 0.5363325363 * g + 0.0514459929 * b; - let mut m = 0.2119034982 * r + 0.6806995451 * g + 0.1073969566 * b; - let mut s = 0.0883024619 * r + 0.2817188376 * g + 0.6299787005 * b; + fn srgb_to_linear_vec(c: Vec3A) -> Vec3A { + Vec3A::select( + c.cmpge(Vec3A::splat(0.04045)), + { + // This uses a Padé approximant for ((c + 0.055) / 1.055) ^ 2.4: + // (0.000857709 +0.0359438 x+0.524293 x^2+1.31193 x^3)/(1+0.992498 x-0.119725 x^2) + let c2 = c * c; + let c3 = c2 * c; + Vec3A::min( + Vec3A::ONE, + (0.000857709 + 0.0359438 * c + 0.524293 * c2 + 1.31193 * c3) + / (Vec3A::ONE + 0.992498 * c - 0.119725 * c2), + ) + }, + c * (1.0 / 12.92), + ) + } + fn linear_to_srgb_vec(c: Vec3A) -> Vec3A { + Vec3A::select( + 
c.cmpgt(Vec3A::splat(0.0031308)), + { + // This uses a Padé approximant for 1.055 c^(1/2.4) - 0.055: + // (-0.0117264+21.0897 x+949.46 x^2+2225.62 x^3)/(1+176.398 x+1983.15 x^2+1035.65 x^3) + let c2 = c * c; + let c3 = c2 * c; + (-0.0117264 + 21.0897 * c + 949.46 * c2 + 2225.62 * c3) + / (1.0 + 176.398 * c + 1983.15 * c2 + 1035.65 * c3) + }, + c * 12.92, + ) + } + #[allow(clippy::excessive_precision)] + fn cbrt_vec(x: Vec3A) -> Vec3A { + // This is the fast cbrt approximation from the oklab crate. + // Source: https://gitlab.com/kornelski/oklab/-/blob/d3c074f154187dd5c0642119a6402a6c0753d70c/oklab/src/lib.rs#L61 + // Author: Kornel (https://gitlab.com/kornelski/) + const B: u32 = 709957561; + const C: f32 = 5.4285717010e-1; + const D: f32 = -7.0530611277e-1; + const E: f32 = 1.4142856598e+0; + const F: f32 = 1.6071428061e+0; + const G: f32 = 3.5714286566e-1; - l = O::cbrt(l); - m = O::cbrt(m); - s = O::cbrt(s); + let mut t = Vec3A::from_array( + x.to_array() + .map(|x| f32::from_bits((x.to_bits() / 3).wrapping_add(B))), + ); + let s = C + (t * t) * (t / x); + t *= G + F / (s + E + D / s); + t + } +} - let l_final = l * 0.2104542553 + m * 0.7936177850 + s * -0.0040720468; - let a = l * 1.9779984951 + m * -2.4285922050 + s * 0.4505937099; - let b = l * 0.0259040371 + m * 0.7827717662 + s * -0.8086757660; +#[allow(clippy::excessive_precision)] +fn srgb_to_oklab_impl(srgb: Vec3A) -> Vec3A { + let rgb = O::srgb_to_linear_vec(srgb); + + let lms = Vec3A::new( + rgb.dot(Vec3A::new(0.4122214708, 0.5363325363, 0.0514459929)), + rgb.dot(Vec3A::new(0.2119034982, 0.6806995451, 0.1073969566)), + rgb.dot(Vec3A::new(0.0883024619, 0.2817188376, 0.6299787005)), + ); + let lms = O::cbrt_vec(lms); + + let lab = Vec3A::new( + lms.dot(Vec3A::new(0.2104542553, 0.7936177850, -0.0040720468)), + lms.dot(Vec3A::new(1.9779984951, -2.4285922050, 0.4505937099)), + lms.dot(Vec3A::new(0.0259040371, 0.7827717662, -0.8086757660)), + ); // normalize everything to the 0..1 range - Vec3A::new(l_final, 
a + 0.5, b + 0.5) + lab + Vec3A::new(0.0, 0.5, 0.5) } #[allow(clippy::excessive_precision)] fn oklab_to_srgb_impl(lab: Vec3A) -> Vec3A { - let l_org = lab.x; - let a = lab.y - 0.5; - let b = lab.z - 0.5; - - let mut l = l_org + a * 0.3963377774 + b * 0.2158037573; - let mut m = l_org + a * -0.1055613458 + b * -0.0638541728; - let mut s = l_org + a * -0.0894841775 + b * -1.2914855480; - - l = l * l * l; - m = m * m * m; - s = s * s * s; - - let r = l * 4.0767416621 + m * -3.3077115913 + s * 0.2309699292; - let g = l * -1.2684380046 + m * 2.6097574011 + s * -0.3413193965; - let b = l * -0.0041960863 + m * -0.7034186147 + s * 1.7076147010; - - Vec3A::new( - O::linear_to_srgb(r), - O::linear_to_srgb(g), - O::linear_to_srgb(b), - ) + let lab_norm = lab - Vec3A::new(0.0, 0.5, 0.5); + let lms = Vec3A::new( + lab_norm.dot(Vec3A::new(1.0, 0.3963377774, 0.2158037573)), + lab_norm.dot(Vec3A::new(1.0, -0.1055613458, -0.0638541728)), + lab_norm.dot(Vec3A::new(1.0, -0.0894841775, -1.2914855480)), + ); + let lms = lms * lms * lms; // lms^3 + let rgb = Vec3A::new( + lms.dot(Vec3A::new(4.0767416621, -3.3077115913, 0.2309699292)), + lms.dot(Vec3A::new(-1.2684380046, 2.6097574011, -0.3413193965)), + lms.dot(Vec3A::new(-0.0041960863, -0.7034186147, 1.7076147010)), + ); + + O::linear_to_srgb_vec(rgb) } #[allow(unused)] From dcb151ac63e9b9d50cdd7a2b6b165767165c4e27 Mon Sep 17 00:00:00 2001 From: RunDevelopment Date: Wed, 17 Sep 2025 14:17:22 +0200 Subject: [PATCH 2/8] Remove unused scalar implementations --- src/color/oklab.rs | 156 ++++++++++++++++++--------------------------- 1 file changed, 62 insertions(+), 94 deletions(-) diff --git a/src/color/oklab.rs b/src/color/oklab.rs index 6a660cf..fac653e 100644 --- a/src/color/oklab.rs +++ b/src/color/oklab.rs @@ -1,98 +1,51 @@ use glam::Vec3A; trait Operations { - fn srgb_to_linear(c: f32) -> f32; - fn linear_to_srgb(c: f32) -> f32; - fn cbrt(x: f32) -> f32; - - fn srgb_to_linear_vec(x: Vec3A) -> Vec3A { - Vec3A::new( - 
Self::srgb_to_linear(x.x), - Self::srgb_to_linear(x.y), - Self::srgb_to_linear(x.z), - ) - } - fn linear_to_srgb_vec(x: Vec3A) -> Vec3A { - Vec3A::new( - Self::linear_to_srgb(x.x), - Self::linear_to_srgb(x.y), - Self::linear_to_srgb(x.z), - ) - } - fn cbrt_vec(x: Vec3A) -> Vec3A { - Vec3A::new(Self::cbrt(x.x), Self::cbrt(x.y), Self::cbrt(x.z)) - } + fn srgb_to_linear(c: Vec3A) -> Vec3A; + fn linear_to_srgb(c: Vec3A) -> Vec3A; + fn cbrt(x: Vec3A) -> Vec3A; } struct Reference; impl Operations for Reference { - fn srgb_to_linear(c: f32) -> f32 { - if c >= 0.04045 { - ((c + 0.055) / 1.055).powf(2.4) - } else { - c / 12.92 + fn srgb_to_linear(c: Vec3A) -> Vec3A { + fn srgb_to_linear(c: f32) -> f32 { + if c >= 0.04045 { + ((c + 0.055) / 1.055).powf(2.4) + } else { + c / 12.92 + } } + + Vec3A::new( + srgb_to_linear(c.x), + srgb_to_linear(c.y), + srgb_to_linear(c.z), + ) } - fn linear_to_srgb(c: f32) -> f32 { - if c > 0.0031308 { - 1.055 * c.powf(1.0 / 2.4) - 0.055 - } else { - 12.92 * c + fn linear_to_srgb(c: Vec3A) -> Vec3A { + fn linear_to_srgb(c: f32) -> f32 { + if c > 0.0031308 { + 1.055 * c.powf(1.0 / 2.4) - 0.055 + } else { + 12.92 * c + } } + + Vec3A::new( + linear_to_srgb(c.x), + linear_to_srgb(c.y), + linear_to_srgb(c.z), + ) } - fn cbrt(x: f32) -> f32 { - f32::cbrt(x) + fn cbrt(x: Vec3A) -> Vec3A { + Vec3A::new(x.x.cbrt(), x.y.cbrt(), x.z.cbrt()) } } struct Fast; impl Operations for Fast { - fn srgb_to_linear(c: f32) -> f32 { - if c >= 0.04045 { - // This uses a Padé approximant for ((c + 0.055) / 1.055) ^ 2.4: - // (0.000857709 +0.0359438 x+0.524293 x^2+1.31193 x^3)/(1+0.992498 x-0.119725 x^2) - let c2 = c * c; - let c3 = c2 * c; - f32::min( - 1.0, - (0.000857709 + 0.0359438 * c + 0.524293 * c2 + 1.31193 * c3) - / (1.0 + 0.992498 * c - 0.119725 * c2), - ) - } else { - c * (1.0 / 12.92) - } - } - fn linear_to_srgb(c: f32) -> f32 { - if c > 0.0031308 { - // This uses a Padé approximant for 1.055 c^(1/2.4) - 0.055: - // (-0.0117264+21.0897 x+949.46 x^2+2225.62 
x^3)/(1+176.398 x+1983.15 x^2+1035.65 x^3) - let c2 = c * c; - let c3 = c2 * c; - (-0.0117264 + 21.0897 * c + 949.46 * c2 + 2225.62 * c3) - / (1.0 + 176.398 * c + 1983.15 * c2 + 1035.65 * c3) - } else { - 12.92 * c - } - } - #[allow(clippy::excessive_precision)] - fn cbrt(x: f32) -> f32 { - // This is the fast cbrt approximation from the oklab crate. - // Source: https://gitlab.com/kornelski/oklab/-/blob/d3c074f154187dd5c0642119a6402a6c0753d70c/oklab/src/lib.rs#L61 - // Author: Kornel (https://gitlab.com/kornelski/) - const B: u32 = 709957561; - const C: f32 = 5.4285717010e-1; - const D: f32 = -7.0530611277e-1; - const E: f32 = 1.4142856598e+0; - const F: f32 = 1.6071428061e+0; - const G: f32 = 3.5714286566e-1; - - let mut t = f32::from_bits((x.to_bits() / 3).wrapping_add(B)); - let s = C + (t * t) * (t / x); - t *= G + F / (s + E + D / s); - t - } - - fn srgb_to_linear_vec(c: Vec3A) -> Vec3A { + fn srgb_to_linear(c: Vec3A) -> Vec3A { Vec3A::select( c.cmpge(Vec3A::splat(0.04045)), { @@ -109,7 +62,7 @@ impl Operations for Fast { c * (1.0 / 12.92), ) } - fn linear_to_srgb_vec(c: Vec3A) -> Vec3A { + fn linear_to_srgb(c: Vec3A) -> Vec3A { Vec3A::select( c.cmpgt(Vec3A::splat(0.0031308)), { @@ -124,7 +77,7 @@ impl Operations for Fast { ) } #[allow(clippy::excessive_precision)] - fn cbrt_vec(x: Vec3A) -> Vec3A { + fn cbrt(x: Vec3A) -> Vec3A { // This is the fast cbrt approximation from the oklab crate. 
// Source: https://gitlab.com/kornelski/oklab/-/blob/d3c074f154187dd5c0642119a6402a6c0753d70c/oklab/src/lib.rs#L61 // Author: Kornel (https://gitlab.com/kornelski/) @@ -147,14 +100,14 @@ impl Operations for Fast { #[allow(clippy::excessive_precision)] fn srgb_to_oklab_impl(srgb: Vec3A) -> Vec3A { - let rgb = O::srgb_to_linear_vec(srgb); + let rgb = O::srgb_to_linear(srgb); let lms = Vec3A::new( rgb.dot(Vec3A::new(0.4122214708, 0.5363325363, 0.0514459929)), rgb.dot(Vec3A::new(0.2119034982, 0.6806995451, 0.1073969566)), rgb.dot(Vec3A::new(0.0883024619, 0.2817188376, 0.6299787005)), ); - let lms = O::cbrt_vec(lms); + let lms = O::cbrt(lms); let lab = Vec3A::new( lms.dot(Vec3A::new(0.2104542553, 0.7936177850, -0.0040720468)), @@ -180,7 +133,7 @@ fn oklab_to_srgb_impl(lab: Vec3A) -> Vec3A { lms.dot(Vec3A::new(-0.0041960863, -0.7034186147, 1.7076147010)), ); - O::linear_to_srgb_vec(rgb) + O::linear_to_srgb(rgb) } #[allow(unused)] @@ -264,43 +217,58 @@ mod tests { } } + pub struct Scalar(O); + impl Scalar { + fn srgb_to_linear(c: f32) -> f32 { + O::srgb_to_linear(Vec3A::splat(c)).x + } + fn linear_to_srgb(c: f32) -> f32 { + O::linear_to_srgb(Vec3A::splat(c)).x + } + fn cbrt(x: f32) -> f32 { + O::cbrt(Vec3A::splat(x)).x + } + } + type RefScalar = Scalar; + type FastScalar = Scalar; + #[test] fn test_linear_srgb() { for c in 0..=255 { let c = c as f32 / 255.0; - let l = Reference::srgb_to_linear(c); - let c2 = Reference::linear_to_srgb(l); + let l = RefScalar::srgb_to_linear(c); + let c2 = RefScalar::linear_to_srgb(l); assert!((c - c2).abs() < 1e-6, "{c} -> {c2}"); } for c in 0..=255 { let c = c as f32 / 255.0; - let l = Fast::srgb_to_linear(c); - let c2 = Fast::linear_to_srgb(l); + let l = FastScalar::srgb_to_linear(c); + let c2 = FastScalar::linear_to_srgb(l); assert!((c - c2).abs() < 2.5e-3, "{c} -> {c2}"); assert!((0.0..=1.0).contains(&l), "{c} -> {l}"); assert!((0.0..=1.0).contains(&c2), "{c} -> {l}"); } - assert_eq!(Reference::srgb_to_linear(0.0), 0.0); - 
assert!((Reference::srgb_to_linear(1.0) - 1.0).abs() < 1e-6); - assert_eq!(Fast::linear_to_srgb(0.0), 0.0); - assert!((Fast::srgb_to_linear(1.0) - 1.0).abs() < 1e-6); + assert_eq!(RefScalar::srgb_to_linear(0.0), 0.0); + assert!((RefScalar::srgb_to_linear(1.0) - 1.0).abs() < 1e-6); + assert_eq!(FastScalar::linear_to_srgb(0.0), 0.0); + assert!((FastScalar::srgb_to_linear(1.0) - 1.0).abs() < 1e-6); } #[test] fn test_error_fast_srgb_to_linear() { assert_eq!( - get_error_stats(Reference::srgb_to_linear, Fast::srgb_to_linear), + get_error_stats(RefScalar::srgb_to_linear, FastScalar::srgb_to_linear), "Error: avg=0.00002514 max=0.00013047 for 0.999" ); } #[test] fn test_error_fast_linear_to_srgb() { assert_eq!( - get_error_stats(Reference::linear_to_srgb, Fast::linear_to_srgb), + get_error_stats(RefScalar::linear_to_srgb, FastScalar::linear_to_srgb), "Error: avg=0.00105457 max=0.00236702 for 0.732" ); } From 908cc9b47105462a03be3ce11142f1bc0bb8bcae Mon Sep 17 00:00:00 2001 From: RunDevelopment Date: Sat, 20 Sep 2025 14:05:43 +0200 Subject: [PATCH 3/8] Faster approximation for sRGB to linear --- src/color/oklab.rs | 16 +-- test-data/encode_quality.md | 124 +++++++++--------- .../output-encode/compression/_hashes.yml | 40 +++--- 3 files changed, 89 insertions(+), 91 deletions(-) diff --git a/src/color/oklab.rs b/src/color/oklab.rs index fac653e..8669135 100644 --- a/src/color/oklab.rs +++ b/src/color/oklab.rs @@ -44,20 +44,19 @@ impl Operations for Reference { } struct Fast; +#[allow(clippy::excessive_precision)] impl Operations for Fast { fn srgb_to_linear(c: Vec3A) -> Vec3A { Vec3A::select( c.cmpge(Vec3A::splat(0.04045)), { - // This uses a Padé approximant for ((c + 0.055) / 1.055) ^ 2.4: - // (0.000857709 +0.0359438 x+0.524293 x^2+1.31193 x^3)/(1+0.992498 x-0.119725 x^2) + // Polynomial approximation for ((c + 0.055) / 1.055) ^ 2.4 + // This has a max error of 0.0001228 and is exact at c=0.04045 and c=1 let c2 = c * c; let c3 = c2 * c; - Vec3A::min( - Vec3A::ONE, - 
(0.000857709 + 0.0359438 * c + 0.524293 * c2 + 1.31193 * c3) - / (Vec3A::ONE + 0.992498 * c - 0.119725 * c2), - ) + let c4 = c2 * c2; + + 0.00117465 + 0.02381997 * c + 0.58750746 * c2 + 0.47736490 * c3 + -0.08986699 * c4 }, c * (1.0 / 12.92), ) @@ -76,7 +75,6 @@ impl Operations for Fast { c * 12.92, ) } - #[allow(clippy::excessive_precision)] fn cbrt(x: Vec3A) -> Vec3A { // This is the fast cbrt approximation from the oklab crate. // Source: https://gitlab.com/kornelski/oklab/-/blob/d3c074f154187dd5c0642119a6402a6c0753d70c/oklab/src/lib.rs#L61 @@ -262,7 +260,7 @@ mod tests { fn test_error_fast_srgb_to_linear() { assert_eq!( get_error_stats(RefScalar::srgb_to_linear, FastScalar::srgb_to_linear), - "Error: avg=0.00002514 max=0.00013047 for 0.999" + "Error: avg=0.00007546 max=0.00012285 for 0.637" ); } #[test] diff --git a/test-data/encode_quality.md b/test-data/encode_quality.md index 2b90737..d8414d0 100644 --- a/test-data/encode_quality.md +++ b/test-data/encode_quality.md @@ -44,15 +44,15 @@ | | | A | 22.66 | 38.14 | 0.56 | | | | | | | | perc | L | 28.83 | 33.50 | 1.23 -| | | R | 18.96 | 20.96 | 5.47 -| | | G | 25.13 | 29.60 | 2.61 +| | | R | 18.95 | 20.96 | 5.47 +| | | G | 25.13 | 29.59 | 2.61 | | | B | 22.10 | 25.11 | 3.69 | | | A | 24.22 | 29.16 | 1.64 | | | | | | -| | perc d | L | 28.68 | 33.54 | 1.16 +| | perc d | L | 28.67 | 33.54 | 1.16 | | | R | 18.94 | 20.97 | 5.33 -| | | G | 25.03 | 29.62 | 2.53 -| | | B | 22.06 | 25.12 | 3.60 +| | | G | 25.02 | 29.62 | 2.53 +| | | B | 22.07 | 25.12 | 3.60 | | | A | 24.22 | 29.16 | 1.64 | | | | | | | | | | | | @@ -81,14 +81,14 @@ | | | A | 15.65 | 32.53 | 1.63 | | | | | | | | perc | L | 21.62 | 24.11 | 6.16 -| | | R | 10.61 | 11.59 | 27.95 -| | | G | 20.48 | 22.99 | 8.69 -| | | B | 13.69 | 15.91 | 15.96 +| | | R | 10.61 | 11.59 | 27.98 +| | | G | 20.48 | 22.98 | 8.70 +| | | B | 13.70 | 15.91 | 15.93 | | | A | 17.80 | 20.01 | 9.10 | | | | | | -| | perc d | L | 21.51 | 24.14 | 6.01 -| | | R | 10.61 | 11.60 | 27.75 -| | | G 
| 20.37 | 23.01 | 8.52 +| | perc d | L | 21.51 | 24.14 | 6.02 +| | | R | 10.61 | 11.60 | 27.78 +| | | G | 20.36 | 23.00 | 8.52 | | | B | 13.67 | 15.91 | 15.98 | | | A | 17.80 | 20.01 | 9.10 | | | | | | @@ -113,15 +113,15 @@ | | | G | 44.93 | 53.22 | 0.43 | | | B | 45.01 | 53.10 | 0.39 | | | | | | -| | perc | L | 52.54 | 59.58 | 0.21 -| | | R | 42.59 | 47.57 | 0.81 -| | | G | 45.63 | 51.86 | 0.52 -| | | B | 43.69 | 49.32 | 0.71 +| | perc | L | 52.55 | 59.60 | 0.21 +| | | R | 42.60 | 47.59 | 0.82 +| | | G | 45.60 | 51.80 | 0.52 +| | | B | 43.66 | 49.31 | 0.71 | | | | | | -| | perc d | L | 51.81 | 59.94 | 0.19 -| | | R | 42.27 | 47.79 | 0.74 -| | | G | 45.17 | 52.33 | 0.47 -| | | B | 43.16 | 49.40 | 0.63 +| | perc d | L | 51.81 | 59.95 | 0.19 +| | | R | 42.30 | 47.85 | 0.74 +| | | G | 45.19 | 52.39 | 0.47 +| | | B | 43.17 | 49.41 | 0.63 | | | | | | | | | | | | | bricks-d.png | fast | L | 34.19 | 44.72 | 0.65 @@ -144,15 +144,15 @@ | | | G | 32.19 | 43.50 | 0.72 | | | B | 31.42 | 42.23 | 1.02 | | | | | | -| | perc | L | 35.67 | 47.50 | 0.40 +| | perc | L | 35.68 | 47.53 | 0.39 | | | R | 30.18 | 39.01 | 0.96 -| | | G | 31.87 | 41.32 | 0.88 -| | | B | 30.42 | 39.26 | 1.46 +| | | G | 31.87 | 41.32 | 0.89 +| | | B | 30.42 | 39.30 | 1.46 | | | | | | -| | perc d | L | 34.88 | 48.34 | 0.31 -| | | R | 29.91 | 39.36 | 0.77 -| | | G | 31.45 | 41.93 | 0.79 -| | | B | 30.12 | 39.71 | 1.35 +| | perc d | L | 34.87 | 48.34 | 0.31 +| | | R | 29.89 | 39.34 | 0.76 +| | | G | 31.44 | 41.96 | 0.79 +| | | B | 30.12 | 39.78 | 1.35 | | | | | | | | | | | | | bricks-n.png | fast | L | 32.99 | 43.84 | 0.60 @@ -175,14 +175,14 @@ | | | G | 27.20 | 38.44 | 0.66 | | | B | 29.34 | 39.15 | 1.04 | | | | | | -| | perc | L | 35.44 | 46.42 | 0.48 -| | | R | 24.33 | 34.33 | 1.60 -| | | G | 30.36 | 41.55 | 0.84 +| | perc | L | 35.44 | 46.43 | 0.48 +| | | R | 24.34 | 34.33 | 1.60 +| | | G | 30.36 | 41.54 | 0.84 | | | B | 28.97 | 38.39 | 1.33 | | | | | | | | perc d | L | 35.03 | 46.99 | 0.39 | | | R | 24.29 | 
34.54 | 1.36 -| | | G | 29.88 | 42.00 | 0.66 +| | | G | 29.87 | 41.99 | 0.66 | | | B | 28.99 | 38.68 | 1.16 | | | | | | | | | | | | @@ -206,15 +206,15 @@ | | | G | 31.99 | 43.77 | 0.70 | | | B | 35.14 | 43.06 | 1.12 | | | | | | -| | perc | L | 35.88 | 47.83 | 0.45 -| | | R | 29.72 | 35.04 | 3.03 +| | perc | L | 35.88 | 47.81 | 0.45 +| | | R | 29.71 | 35.02 | 3.04 | | | G | 33.76 | 45.48 | 0.63 -| | | B | 31.34 | 36.91 | 2.89 +| | | B | 31.31 | 36.83 | 2.92 | | | | | | -| | perc d | L | 35.05 | 48.76 | 0.36 -| | | R | 29.95 | 35.92 | 2.76 -| | | G | 32.94 | 46.38 | 0.52 -| | | B | 31.49 | 37.34 | 2.72 +| | perc d | L | 35.04 | 48.77 | 0.36 +| | | R | 29.91 | 35.85 | 2.78 +| | | G | 32.93 | 46.40 | 0.52 +| | | B | 31.48 | 37.29 | 2.73 | | | | | | | | | | | | | clovers-r.png | fast | L | 33.85 | 45.05 | 0.84 @@ -237,15 +237,15 @@ | | | G | 31.71 | 45.22 | 0.84 | | | B | 31.73 | 45.68 | 0.67 | | | | | | -| | perc | L | 34.61 | 47.18 | 0.43 +| | perc | L | 34.62 | 47.20 | 0.43 | | | R | 32.53 | 43.70 | 1.10 -| | | G | 32.79 | 45.09 | 0.66 -| | | B | 32.45 | 43.52 | 1.09 +| | | G | 32.79 | 45.10 | 0.66 +| | | B | 32.46 | 43.54 | 1.09 | | | | | | -| | perc d | L | 33.73 | 48.29 | 0.34 +| | perc d | L | 33.73 | 48.27 | 0.34 | | | R | 31.72 | 44.64 | 0.98 -| | | G | 31.92 | 46.16 | 0.58 -| | | B | 31.70 | 44.79 | 0.94 +| | | G | 31.92 | 46.14 | 0.58 +| | | B | 31.70 | 44.80 | 0.94 | | | | | | | | | | | | | stone-d.png | fast | L | 34.83 | 45.46 | 0.80 @@ -268,15 +268,15 @@ | | | G | 34.22 | 46.78 | 0.71 | | | B | 34.01 | 45.31 | 0.91 | | | | | | -| | perc | L | 35.55 | 47.16 | 0.48 -| | | R | 34.33 | 43.81 | 1.20 -| | | G | 35.10 | 46.61 | 0.63 -| | | B | 34.36 | 43.75 | 1.18 +| | perc | L | 35.55 | 47.18 | 0.48 +| | | R | 34.32 | 43.79 | 1.21 +| | | G | 35.09 | 46.60 | 0.63 +| | | B | 34.36 | 43.78 | 1.18 | | | | | | -| | perc d | L | 34.69 | 48.01 | 0.37 -| | | R | 33.61 | 44.30 | 1.10 -| | | G | 34.26 | 47.46 | 0.53 -| | | B | 33.68 | 44.34 | 1.08 +| | perc d | L | 34.68 
| 48.05 | 0.37 +| | | R | 33.62 | 44.33 | 1.10 +| | | G | 34.26 | 47.47 | 0.52 +| | | B | 33.68 | 44.37 | 1.07 | | | | | | | | | | | | | grass.png | fast | L | 24.91 | 34.82 | 2.29 @@ -309,7 +309,7 @@ | | | B | 19.67 | 24.68 | 8.26 | | | A | 18.04 | 27.78 | 5.11 | | | | | | -| | perc d | L | 24.98 | 34.87 | 2.25 +| | perc d | L | 24.97 | 34.88 | 2.25 | | | R | 17.59 | 22.58 | 10.50 | | | G | 19.09 | 24.13 | 8.71 | | | B | 19.67 | 24.69 | 8.24 @@ -340,16 +340,16 @@ | | | B | 23.70 | 29.29 | 3.20 | | | A | 21.75 | 35.14 | 1.44 | | | | | | -| | perc | L | 27.99 | 37.81 | 1.20 +| | perc | L | 27.99 | 37.82 | 1.20 | | | R | 17.88 | 23.58 | 6.79 -| | | G | 21.82 | 27.33 | 4.13 -| | | B | 23.66 | 29.15 | 3.30 +| | | G | 21.82 | 27.33 | 4.14 +| | | B | 23.66 | 29.15 | 3.31 | | | A | 22.56 | 32.99 | 2.15 | | | | | | | | perc d | L | 27.97 | 37.88 | 1.17 -| | | R | 17.88 | 23.58 | 6.74 +| | | R | 17.88 | 23.58 | 6.73 | | | G | 21.81 | 27.32 | 4.13 -| | | B | 23.66 | 29.15 | 3.28 +| | | B | 23.66 | 29.15 | 3.29 | | | A | 22.56 | 32.99 | 2.15 | | | | | | | | | | | | @@ -373,12 +373,12 @@ | | | G | 50.89 | 56.78 | 0.40 | | | B | 46.59 | 54.18 | 0.51 | | | | | | -| | perc | L | 57.92 | 60.65 | 0.26 -| | | R | 49.22 | 51.88 | 0.72 -| | | G | 53.91 | 56.64 | 0.41 -| | | B | 49.29 | 51.91 | 0.72 +| | perc | L | 57.93 | 60.65 | 0.26 +| | | R | 49.21 | 51.88 | 0.72 +| | | G | 53.86 | 56.60 | 0.42 +| | | B | 49.29 | 51.90 | 0.72 | | | | | | -| | perc d | L | 53.05 | 63.12 | 0.18 +| | perc d | L | 53.05 | 63.11 | 0.18 | | | R | 45.76 | 52.45 | 0.59 | | | G | 51.06 | 57.28 | 0.36 | | | B | 45.77 | 52.52 | 0.61 diff --git a/test-data/output-encode/compression/_hashes.yml b/test-data/output-encode/compression/_hashes.yml index 89890d3..e479d63 100644 --- a/test-data/output-encode/compression/_hashes.yml +++ b/test-data/output-encode/compression/_hashes.yml @@ -11,10 +11,10 @@ BC1_UNORM dither base.dds: > 3dd0b68c8d6b9120944900a918efdeb39601114472932cefc3d402e4a5ce91e9 BC1_UNORM perc 
base.dds: > - 68e4e63a4d78daa9126720adf2fde39e6d75aa6cd4a22805f98ed7036abb0826 + 7354180658b56762a26ac8a95c9531c47c6302039eb13e45a207e191a6dc4a97 BC1_UNORM perc d base.dds: > - 1e821761ab83f74561091fec256e9ecb6bb929afc3efe4322b945ad4761eab24 + 5db03c980e3719770498b9cf4c6a058142807edf6f8cf5a6d5eb7f38dac8c1aa BC1_UNORM fast color-twirl.dds: > 28639ab8850610eab834f48810db3470603cdf0e7adb18eff06b926aa6f845fb @@ -29,10 +29,10 @@ BC1_UNORM dither color-twirl.dds: > e1e400a3906882e0295fede43f089f22be4067fe9c1bfbaf470b38c78b0588fd BC1_UNORM perc color-twirl.dds: > - 800b5b99e6613f4a1845d83229444833bb3275d4ca134dc62240cf5be54d6de2 + 1d21d0edb087feb7af54d3090ba7a40f32d194e849cc9e3530fb1fd8eb89af4b BC1_UNORM perc d color-twirl.dds: > - ef2cd3c3ca2accd94221905a2e55941826f4cdb508668490f781c18c0a3deddd + b16e4218da085ebb9c207f1f210340926bb5764f06e9f7e4459ceefb7119ed34 BC1_UNORM fast bricks-d.dds: > 00c21c31f106b76e40cc6423d200c4c5b9f3fd720b74f90695b76ca34d35d4cc @@ -47,10 +47,10 @@ BC1_UNORM dither bricks-d.dds: > 37a3741a0d04ca331679f4e53eea4644f25abe4efc886984721b5fddb658f975 BC1_UNORM perc bricks-d.dds: > - b267383f0e1be497e1808e2e85f1a6c69ef3083fcfff75d0724713e85e444f39 + 239a8c2aa74d55834f76a662af8b927ba5d85b7c26c4c6bbe580ad1d42820bb7 BC1_UNORM perc d bricks-d.dds: > - dd49fb03f57806e70ad5c8a4b90be22772f944842be17bbdb60c7dac68aa25b6 + f2a5d1b6b087c44f7901f4409707c78f138be1c1104950892d45f169cf02f512 BC1_UNORM fast bricks-n.dds: > ea9f1d96d840fde687dddcb95d012dab8657cb412b18fd05e007a4316df41f92 @@ -65,10 +65,10 @@ BC1_UNORM dither bricks-n.dds: > 8f3738936abc0bab256be8e7cbdd43a7b9bc101b18e9008c86a71c811939f7dc BC1_UNORM perc bricks-n.dds: > - 1518d4f20076e987510715826871df095c93754e752ce03a9704d11ca61093f3 + ff6e0618a92815526fe3af7bac603f353d8dc1d6de11619eccb3f3d1e28482ed BC1_UNORM perc d bricks-n.dds: > - 7342c5025c177f51746e3fad8bb83cf16e4da61e3cf2e6db55030ba767b1508a + 8fe90f7a90da41eeead35ff4ed86db06126d6b3b19a08ddd588218227da05f81 BC1_UNORM fast clovers-d.dds: > 
e9a77c007eb11ec2b6b881da7565cbeb6e5fa3810e76933e7ca7986f7ab3db91 @@ -83,10 +83,10 @@ BC1_UNORM dither clovers-d.dds: > 09bf1bdbe387b5a7d35534573ef30967fca6cbfd6b98288b69aae21ebd960901 BC1_UNORM perc clovers-d.dds: > - 9bad727142926ecff3137a524fc7f4e587e1b79ddb5ee33303a4df8fb5e86b48 + 4b20672801f76f6d851640c940f305dbaea53a3a6b5f85c3220dbedd450426ea BC1_UNORM perc d clovers-d.dds: > - e80c37ef80167e13647b53b2dec9c53f58f90d163b17ccd40faa8b9358cafceb + a0a9cde64cb8a2e88d4f9146d4df12eaa53571c8877d5e8f3eb148b5e86ac712 BC1_UNORM fast clovers-r.dds: > 57074fc8d5c7e5a868f02b3de502c70974ad94d3f95f1385fa8d56c59d0e54c1 @@ -101,10 +101,10 @@ BC1_UNORM dither clovers-r.dds: > a17abee0d6f22400bb3e1d8cfc1fec0229c465e1552feca66f1507a7b3e0a5fd BC1_UNORM perc clovers-r.dds: > - 633c157f445737ac4bda016848168bf29812df9bafbcc36cb4cf06c8c02c19f7 + 030edc591f22e15381b981735665aa6445cf4ba4e3ac1a192dcbc42df3a96daf BC1_UNORM perc d clovers-r.dds: > - e7e1970f7f7ad06a70069cabea5f8b8377ad1fa37c82a8373fad85f3c4086d6b + e9e160b4dcc773f1d0b778683a2e9ab28e9f100ef7db09ab8f51ae284ec2fe21 BC1_UNORM fast stone-d.dds: > 137b9f6e61710d6631220bc781734a74d54acff8ca4a6ba095e5a6220e7ca4c5 @@ -119,10 +119,10 @@ BC1_UNORM dither stone-d.dds: > 0d8f808a3078ab97f1288dfc180686bfb3bc260858cba5f885bf9b30a5353b8c BC1_UNORM perc stone-d.dds: > - e7e756a66fcf8681796f43882d788c5223a64f03fd18440b8d482446f3c344c4 + 45277ce85c6ec6f1c521a4af5f61e9022d745c96faca46a9088cbe0e86576c85 BC1_UNORM perc d stone-d.dds: > - 470f069d19e5a4b86d698c993ea9e3adef80ccf392a8fd2f28749b3fc701b370 + 2448debadf1edd1dc4188541460f8d6ad86207a801fe3be1c56eb2418f29f168 BC1_UNORM fast grass.dds: > bdc8e999300e111012631dc4614f3eed61475895a7c7d7879ba66af5985db8a2 @@ -137,10 +137,10 @@ BC1_UNORM dither grass.dds: > 535b51a248e4f70722571dbd439c453a32566e17f26f264c6a9623a32f381e2c BC1_UNORM perc grass.dds: > - 7a5f9ddc728f73670a1fb008369a75a0e70c37e0a0ae38214e7681f97030d24b + 62586ce86225cdd89bfede49d21624732551289b786dbd8a8389c88557247be3 BC1_UNORM 
perc d grass.dds: > - 0de89fb78b7fcc367d88bba233f6abac5590b6c28ba610f31347960664399526 + 2cb0464d6bdaca85368d99652a0cf9fb7c2853071f5dc1996981a55ffff70ba4 BC1_UNORM fast leaves.dds: > c690e1c6747aa91904c75b781422bd785ac702f180207c535421fd4fec6c5108 @@ -155,10 +155,10 @@ BC1_UNORM dither leaves.dds: > 9d91abfc5115c201b977b13575b2548a76c081b06dacdcc79609275ce11e1629 BC1_UNORM perc leaves.dds: > - 837af17fe2df98e88eace6d3a610ea1848683ef372c750d8779c9a5b1e18e139 + e0e7d6550bc7bfa285286d52f410a14787c48f9dd2998a0f59f35cfb95dd7ceb BC1_UNORM perc d leaves.dds: > - 94bb3175e555c0385ccbbe87e92a95ad4969fa2a84ed836694fd427a9c25b9e5 + c73694efabe71aaeddba3290cacf01eac4a4da4d93a93ad9d6f6dd89802cb35b BC1_UNORM fast random color.dds: > 6f8dc5ffe90e606ce651a8bc99db577e0fba4c25dd28ac79d3d2b4e340204470 @@ -173,10 +173,10 @@ BC1_UNORM dither random color.dds: > d7b56b8f822b1da55dc376b318425fc8a30474f74dc07b24de3111d649849fbc BC1_UNORM perc random color.dds: > - edd64ae272e674f6eed55af47c168768794d0f029e96fff2c200968b7fc293ec + 974bc95b93d379fb30abc517aef60823ff815e05033536f153d2fdc938905d0a BC1_UNORM perc d random color.dds: > - 42040b1e00f8720084134104afe6ca248c3ee81c7651496a2def8e860a826351 + 2ac37f74016a8cfee3e31ad1adc108c3a9a797676314e0f02108b91997f376df BC4_UNORM fast base.dds: > 5a1e1cb4b8219ce9bbd98458103c976a43649c039562ea541dd0989a8c561b19 From fe478bd258702b3c60c1e44ccb76cdad330e326c Mon Sep 17 00:00:00 2001 From: RunDevelopment Date: Sun, 21 Sep 2025 13:08:30 +0200 Subject: [PATCH 4/8] Faster `cbrt` approximation --- src/color/oklab.rs | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/color/oklab.rs b/src/color/oklab.rs index 8669135..435691f 100644 --- a/src/color/oklab.rs +++ b/src/color/oklab.rs @@ -76,23 +76,22 @@ impl Operations for Fast { ) } fn cbrt(x: Vec3A) -> Vec3A { - // This is the fast cbrt approximation from the oklab crate. 
- // Source: https://gitlab.com/kornelski/oklab/-/blob/d3c074f154187dd5c0642119a6402a6c0753d70c/oklab/src/lib.rs#L61 - // Author: Kornel (https://gitlab.com/kornelski/) + // This is the fast cbrt approximation inspired by the non-std cbrt + // implementation (https://gitlab.com/kornelski/oklab/-/blob/d3c074f154187dd5c0642119a6402a6c0753d70c/oklab/src/lib.rs#L61) + // in the oklab crate by Kornel (https://gitlab.com/kornelski/), which + // in turn seems to be based on the libm implementation. + // In this version, I replaced the part after the initial guess with + // one Halley iteration. This reduces accuracy, but saves 2 divisions + // which helps performance a lot. const B: u32 = 709957561; - const C: f32 = 5.4285717010e-1; - const D: f32 = -7.0530611277e-1; - const E: f32 = 1.4142856598e+0; - const F: f32 = 1.6071428061e+0; - const G: f32 = 3.5714286566e-1; - - let mut t = Vec3A::from_array( + let t = Vec3A::from_array( x.to_array() .map(|x| f32::from_bits((x.to_bits() / 3).wrapping_add(B))), ); - let s = C + (t * t) * (t / x); - t *= G + F / (s + E + D / s); - t + + // one halley iteration + let s = t * t * t; + t * (s + 2.0 * x) / (2.0 * s + x) } } @@ -270,6 +269,13 @@ mod tests { "Error: avg=0.00105457 max=0.00236702 for 0.732" ); } + #[test] + fn test_error_fast_cbrt() { + assert_eq!( + get_error_stats(RefScalar::cbrt, FastScalar::cbrt), + "Error: avg=0.00000283 max=0.00001299 for 0.250" + ); + } fn get_error_stats(f1: impl Fn(f32) -> f32, f2: impl Fn(f32) -> f32) -> String { let count = 1000; From adb0469328204efe9c73d4e95661c5fa780f86a6 Mon Sep 17 00:00:00 2001 From: RunDevelopment Date: Sun, 21 Sep 2025 14:16:31 +0200 Subject: [PATCH 5/8] Use multiply+shift for /3 --- src/color/oklab.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/color/oklab.rs b/src/color/oklab.rs index 435691f..f01e14e 100644 --- a/src/color/oklab.rs +++ b/src/color/oklab.rs @@ -84,10 +84,15 @@ impl Operations for Fast { // one Halley iteration. 
This reduces accuracy, but saves 2 divisions // which helps performance a lot. const B: u32 = 709957561; - let t = Vec3A::from_array( - x.to_array() - .map(|x| f32::from_bits((x.to_bits() / 3).wrapping_add(B))), - ); + fn initial_guess(x: f32) -> f32 { + let bits = x.to_bits(); + // divide by 3 using multiplication and bitshift + // this is only correct if bits < 2^31, which is true for all + // positive f32 values + let div = ((bits as u64 * 1431655766) >> 32) as u32; + f32::from_bits(div + B) + } + let t = Vec3A::from_array(x.to_array().map(initial_guess)); // one halley iteration let s = t * t * t; From 7b2b523d70c77083d44e9f54b2f936a20120cb86 Mon Sep 17 00:00:00 2001 From: Michael Schmidt Date: Mon, 29 Sep 2025 19:12:16 +0200 Subject: [PATCH 6/8] FMA + more optimal polynomials --- src/color/oklab.rs | 75 ++++++++++++++++--- test-data/encode_quality.md | 6 +- .../output-encode/compression/_hashes.yml | 28 +++---- 3 files changed, 80 insertions(+), 29 deletions(-) diff --git a/src/color/oklab.rs b/src/color/oklab.rs index f01e14e..9424d95 100644 --- a/src/color/oklab.rs +++ b/src/color/oklab.rs @@ -43,6 +43,32 @@ impl Operations for Reference { } } +/// A fast fused multiply-add operation that uses hardware FMA if available. +/// If hardware FMA is not available, it falls back to a regular multiply-add.
+#[inline(always)] +fn fma(a: Vec3A, b: Vec3A, c: Vec3A) -> Vec3A { + #[cfg(any( + all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "fma" + ), + target_arch = "aarch64" + ))] + { + a.mul_add(b, c) + } + #[cfg(not(any( + all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "fma" + ), + target_arch = "aarch64" + )))] + { + a * b + c + } +} + struct Fast; #[allow(clippy::excessive_precision)] impl Operations for Fast { @@ -52,11 +78,16 @@ impl Operations for Fast { { // Polynomial approximation for ((c + 0.055) / 1.055) ^ 2.4 // This has a max error of 0.0001228 and is exact at c=0.04045 and c=1 + const A0: f32 = 0.00117465; + const A1: f32 = 0.02381997; + const A2: f32 = 0.58750746; + const A3: f32 = 0.47736490; + const A4: f32 = -0.08986699; let c2 = c * c; - let c3 = c2 * c; - let c4 = c2 * c2; - - 0.00117465 + 0.02381997 * c + 0.58750746 * c2 + 0.47736490 * c3 + -0.08986699 * c4 + let p01 = fma(c, Vec3A::splat(A1), Vec3A::splat(A0)); + let p23 = fma(c, Vec3A::splat(A3), Vec3A::splat(A2)); + let t = fma(c2, Vec3A::splat(A4), p23); + fma(c2, t, p01) }, c * (1.0 / 12.92), ) @@ -67,10 +98,21 @@ impl Operations for Fast { { // This uses a Padé approximant for 1.055 c^(1/2.4) - 0.055: // (-0.0117264+21.0897 x+949.46 x^2+2225.62 x^3)/(1+176.398 x+1983.15 x^2+1035.65 x^3) + const P0: f32 = -0.0117264; + const P1: f32 = 21.0897; + const P2: f32 = 949.46; + const P3: f32 = 2225.62; + const Q1: f32 = 176.398; + const Q2: f32 = 1983.15; + const Q3: f32 = 1035.65; let c2 = c * c; - let c3 = c2 * c; - (-0.0117264 + 21.0897 * c + 949.46 * c2 + 2225.62 * c3) - / (1.0 + 176.398 * c + 1983.15 * c2 + 1035.65 * c3) + let p01 = fma(c, Vec3A::splat(P1), Vec3A::splat(P0)); + let p23 = fma(c, Vec3A::splat(P3), Vec3A::splat(P2)); + let p = fma(c2, p23, p01); + let q01 = fma(c, Vec3A::splat(Q1), Vec3A::ONE); + let q23 = fma(c, Vec3A::splat(Q3), Vec3A::splat(Q2)); + let q = fma(c2, q23, q01); + p / q }, c * 12.92, ) @@ -135,7 +177,8 @@ fn 
oklab_to_srgb_impl(lab: Vec3A) -> Vec3A { lms.dot(Vec3A::new(-0.0041960863, -0.7034186147, 1.7076147010)), ); - O::linear_to_srgb(rgb) + // the clamping is necessary for out-of-gamut colors + O::linear_to_srgb(rgb).clamp(Vec3A::ZERO, Vec3A::ONE) } #[allow(unused)] @@ -195,14 +238,14 @@ mod tests { let ref_oklab = srgb_to_oklab(color); assert!( - (fast_oklab - ref_oklab).abs().max_element() < 1e-3, + (fast_oklab - ref_oklab).abs().max_element() < 0.001, "{color:?} -> fast: {fast_oklab:?} vs ref: {ref_oklab:?}" ); let srgb = fast_oklab_to_srgb(fast_oklab); assert!( - (color - srgb).abs().max_element() < 2.5e-3, + (color - srgb).abs().max_element() < 0.0025, "{color:?} -> {srgb:?}" ); @@ -214,6 +257,14 @@ mod tests { fast_oklab.min_element() >= 0.0, "{color:?} -> {fast_oklab:?}" ); + assert!( + srgb.max_element() <= 1.0, + "{color:?} -> {fast_oklab:?} -> {srgb:?}" + ); + assert!( + srgb.min_element() >= 0.0, + "{color:?} -> {fast_oklab:?} -> {srgb:?}" + ); } } } @@ -264,14 +315,14 @@ mod tests { fn test_error_fast_srgb_to_linear() { assert_eq!( get_error_stats(RefScalar::srgb_to_linear, FastScalar::srgb_to_linear), - "Error: avg=0.00007546 max=0.00012285 for 0.637" + "Error: avg=0.00007546 max=0.00012287 for 0.641" ); } #[test] fn test_error_fast_linear_to_srgb() { assert_eq!( get_error_stats(RefScalar::linear_to_srgb, FastScalar::linear_to_srgb), - "Error: avg=0.00105457 max=0.00236702 for 0.732" + "Error: avg=0.00105456 max=0.00236708 for 0.730" ); } #[test] diff --git a/test-data/encode_quality.md b/test-data/encode_quality.md index 36e3565..8b810ac 100644 --- a/test-data/encode_quality.md +++ b/test-data/encode_quality.md @@ -213,7 +213,7 @@ | | | | | | | | perc d | L | 34.97 | 49.75 | 0.31 | | | R | 30.11 | 36.64 | 2.27 -| | | G | 32.88 | 47.38 | 0.43 +| | | G | 32.87 | 47.39 | 0.43 | | | B | 31.72 | 38.05 | 2.26 | | | | | | | | | | | | @@ -242,7 +242,7 @@ | | | G | 32.83 | 45.17 | 0.65 | | | B | 32.51 | 43.68 | 1.07 | | | | | | -| | perc d | L | 33.80 | 49.06 | 
0.31 +| | perc d | L | 33.80 | 49.07 | 0.31 | | | R | 31.68 | 45.39 | 0.82 | | | G | 31.97 | 46.91 | 0.49 | | | B | 31.75 | 45.66 | 0.77 @@ -375,7 +375,7 @@ | | | | | | | | perc | L | 57.93 | 60.65 | 0.26 | | | R | 49.21 | 51.88 | 0.72 -| | | G | 53.86 | 56.60 | 0.42 +| | | G | 53.86 | 56.60 | 0.41 | | | B | 49.29 | 51.90 | 0.72 | | | | | | | | perc d | L | 53.18 | 63.08 | 0.18 diff --git a/test-data/output-encode/compression/_hashes.yml b/test-data/output-encode/compression/_hashes.yml index 6f787b0..5d5d33d 100644 --- a/test-data/output-encode/compression/_hashes.yml +++ b/test-data/output-encode/compression/_hashes.yml @@ -32,7 +32,7 @@ BC1_UNORM perc color-twirl.dds: > e420da281f89093fc89537b143757364ed08d7a604047d756805effcdcce8092 BC1_UNORM perc d color-twirl.dds: > - 5c71f2e5c610b1efb15d785db14b1ba8e67a612a7eaaaaf49fb5e3b61088d00d + 23aee1fb31cc4c4ac0ddc4ab3d5511761c152644ab1931be3ac10c0befa2d904 BC1_UNORM fast bricks-d.dds: > 407b1b63a944454ee57142fad0c5ea788337c63f5973385f00aedfd8aa316879 @@ -47,7 +47,7 @@ BC1_UNORM dither bricks-d.dds: > a184e049d54f37e2eea90e6563b8b802a6d069ac99fc9edf933684b41ca174bd BC1_UNORM perc bricks-d.dds: > - 350dcf662f0751180a96b27c8ee426458fabc68d78f8e08a9ab49c6ecc151018 + dcb9eda79633eeb9bc25e7c12204afa87f2f4dd1d593c7f7a5994a0355eafa07 BC1_UNORM perc d bricks-d.dds: > 695556967dd11b920479883930a28cf93a41fcf40cdb1338947737da843d5cd9 @@ -65,10 +65,10 @@ BC1_UNORM dither bricks-n.dds: > 5da3a86c92e897b60ba17eafef033a679c3cf1a1f6210282ccf21721f82e5ae6 BC1_UNORM perc bricks-n.dds: > - b8d7e2a4c70f5220e24b90ff94977e1f6e5edb07e347169335fd3e9c747773d8 + 38ddf81722f9b00594bd95c225930ad7d91a9e0ee2f1a2c3ec59e0f4c35bef20 BC1_UNORM perc d bricks-n.dds: > - 1a9f0e9527d0393d0fd36a042d0789511d3bab9a9267698537d23798f28a34a9 + 1f45fec6d3437ee8d4c57500e197d0f9f49afcb31ce99c94a002c6ba709dbb98 BC1_UNORM fast clovers-d.dds: > 31709e578f4ebe14fc497e027ad3601684b749a4ff7761557737705156218a6b @@ -83,10 +83,10 @@ BC1_UNORM dither clovers-d.dds: > 
9e82d8c1b3d8c25a39d0b02801014da5aacfbcf2d74cf65a6cb46e567e33bb00 BC1_UNORM perc clovers-d.dds: > - 2143addddadc0d2a8b37910c7d7a7215425169731ffd82d90da2137a3a9bf582 + ebd1c53becdbfca3eacb2c8091889e90a0c8eec891bdc51c72e8f77bcfa025fc BC1_UNORM perc d clovers-d.dds: > - eb0346cc9b3bf88d0ca4d045b02dcd1e01c4e42ba160077617460882d50062c1 + fc454155851270165f02d8338c47226b2238c992b2cf0cd59eb1d314a8804f07 BC1_UNORM fast clovers-r.dds: > 797e217227149e775d8ee5d55778e7c38111bab9c38ef2a594392982ced3ee72 @@ -101,10 +101,10 @@ BC1_UNORM dither clovers-r.dds: > ac6240507843f1dd4af4ba2ae101d5bf9eab6ea55027f0837cebab52443ca12a BC1_UNORM perc clovers-r.dds: > - 8b2f560c41971d10c575e075593ff2b094c0114be0fdcaefbca784577afbd1f9 + 6179b8b9dd592432b56c8feae3438f1839767d1a5acbffc544257cc259d0cf7f BC1_UNORM perc d clovers-r.dds: > - deb6441a61883d160a0cbd6576ea9bacc500997834b239577b983514fc5091a9 + 779567e7af628c296086112fffcf74fbe928f040f0384fee5049b4c00ba21ee4 BC1_UNORM fast stone-d.dds: > 3b0fa35baf5608d317dc97e5aedf4bf5152afa951068d66fb14727f5e85778ae @@ -119,10 +119,10 @@ BC1_UNORM dither stone-d.dds: > e992f7febd885bdb317f28da576e82091c03b013af71f01f78e6098f310bfa29 BC1_UNORM perc stone-d.dds: > - b0cd67a4ce01e1974e5ef0aa8ddf28a754df42af9ebc718cf9c812ef49d263b8 + 0e2707438d9157b4061a10efdbb5666120741bc0c6b02ad22ff6db243aad7871 BC1_UNORM perc d stone-d.dds: > - f4ab675798f60801f7ab14374cf4de59fb972af4cc611413984a5ca4291a7279 + 6353617123ce724c7079b129fc0d0d2820f67c7f7e78d95a688dbba2cd64529d BC1_UNORM fast grass.dds: > c24d7b91247e7cc3dd1fd913827a20e16dedbff46bcc5dbb506ffd9e7dc38c61 @@ -137,10 +137,10 @@ BC1_UNORM dither grass.dds: > 5a70910ead1f6266fbf4ae4a96f7557d813994b5512f22dfc28bd1124dab5377 BC1_UNORM perc grass.dds: > - fafef65714f812f399144ee45bb9bbf371c75e296e15ca5c1c93e9e173ac350b + 75dfed97264a19f8860c7089d7723aa042c48757b6589d31587bca1a7d26137c BC1_UNORM perc d grass.dds: > - e5b1f540d88e563c019fecdd898893dfb8fdee59b001e3c3be94bea2d36b9c77 + 
f0f0930b72beb9f1018d57ce54433cb0b33449cf6d53d95e21f74a7f18d56f72 BC1_UNORM fast leaves.dds: > cbe0da225eb6fdf655997b616cce5e642eb44b7da4461c4e441b095a6a298c98 @@ -173,10 +173,10 @@ BC1_UNORM dither random color.dds: > 2fe0aa2ed8b19cfb31eea2a1d253e4d79f807b0c41daf96be617484dd31b0f4e BC1_UNORM perc random color.dds: > - 0225aacda329a21b50e4bb0aa38d59779b5549528a2e463684d854fbccb52efa + e1c12e6344ebf14ba05928c8c80231dddb6d7fd5e673dcdfc8c71b557c66103f BC1_UNORM perc d random color.dds: > - 1391e0f102f8a2e653484b94dec0f1bcdae1b55853d03cc619e14213e6140334 + 6361880a24373816db573388f9ae3a47d5af5dd3ffebb3f6ec84571f5cc8bb45 BC4_UNORM fast base.dds: > 5a1e1cb4b8219ce9bbd98458103c976a43649c039562ea541dd0989a8c561b19 From c38bfa68cf0e494276c585b4d6c43c70da0918f9 Mon Sep 17 00:00:00 2001 From: Michael Schmidt Date: Mon, 29 Sep 2025 22:36:55 +0200 Subject: [PATCH 7/8] Trigger CI From 2f4ffa1caa7b82523d75b01bf62c7f95ae829261 Mon Sep 17 00:00:00 2001 From: Michael Schmidt Date: Tue, 30 Sep 2025 10:58:20 +0200 Subject: [PATCH 8/8] Use FMA for the Halley iteration --- src/color/oklab.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/color/oklab.rs b/src/color/oklab.rs index 9424d95..1e30b5c 100644 --- a/src/color/oklab.rs +++ b/src/color/oklab.rs @@ -138,7 +138,7 @@ impl Operations for Fast { // one halley iteration let s = t * t * t; - t * (s + 2.0 * x) / (2.0 * s + x) + t * fma(Vec3A::splat(2.0), x, s) / fma(Vec3A::splat(2.0), s, x) } }