From 8ca7cd0ef7b5c62ce8d0656e7c372e7a79ac0b66 Mon Sep 17 00:00:00 2001 From: Henrik Date: Thu, 11 Jan 2024 00:07:12 +0100 Subject: [PATCH 1/5] Use complex mul-acc --- src/lib.rs | 1 + src/neon/neon_utils.rs | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 88f48914..7277c1f7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ #![cfg_attr(all(feature = "bench", test), feature(test))] +#![feature(stdsimd)] //! RustFFT is a high-performance FFT library written in pure Rust. //! diff --git a/src/neon/neon_utils.rs b/src/neon/neon_utils.rs index 19b7b899..def12231 100644 --- a/src/neon/neon_utils.rs +++ b/src/neon/neon_utils.rs @@ -176,13 +176,12 @@ pub unsafe fn transpose_complex_2x2_f32(left: float32x4_t, right: float32x4_t) - #[inline(always)] pub unsafe fn mul_complex_f32(left: float32x4_t, right: float32x4_t) -> float32x4_t { // ARMv8.2-A introduced vcmulq_f32 and vcmlaq_f32 for complex multiplication, these intrinsics are not yet available. - let temp1 = vtrn1q_f32(right, right); - let temp2 = vtrn2q_f32(right, vnegq_f32(right)); - let temp3 = vmulq_f32(temp2, left); - let temp4 = vrev64q_f32(temp3); - vfmaq_f32(temp4, temp1, left) + let temp = vmovq_n_f32(0.0); + let step1 = vcmlaq_f32(temp, left, right); + vcmlaq_rot90_f32(step1, left, right) } + // __ __ _ _ __ _ _ _ _ _ // | \/ | __ _| |_| |__ / /_ | || | | |__ (_) |_ // | |\/| |/ _` | __| '_ \ _____ | '_ \| || |_| '_ \| | __| @@ -219,9 +218,12 @@ impl Rotate90F64 { #[inline(always)] pub unsafe fn mul_complex_f64(left: float64x2_t, right: float64x2_t) -> float64x2_t { // ARMv8.2-A introduced vcmulq_f64 and vcmlaq_f64 for complex multiplication, these intrinsics are not yet available. - let temp = vcombine_f64(vneg_f64(vget_high_f64(left)), vget_low_f64(left)); - let sum = vmulq_laneq_f64::<0>(left, right); - vfmaq_laneq_f64::<1>(sum, temp, right) + //let temp = vcombine_f64(vneg_f64(vget_high_f64(left)), vget_low_f64(left)); + //let sum = vmulq_laneq_f64::<0>(left, right); + //vfmaq_laneq_f64::<1>(sum, temp, right) + let temp = vmovq_n_f64(0.0); + let step1 = vcmlaq_f64(temp, left, right); + vcmlaq_rot90_f64(step1, left, right) } #[cfg(test)] From c309cac059f0aa1c64380acfc8ed2ad838cc9a19 Mon Sep 17 00:00:00 2001 From: Henrik Date: Sat, 13 Jan 2024 11:15:34 +0100 Subject: [PATCH 2/5] WIP faster rotations --- src/neon/neon_butterflies.rs | 2 +- src/neon/neon_utils.rs | 78 +++++++++++++++++++++++++++++------- 2 files changed, 65 insertions(+), 15 deletions(-) diff --git a/src/neon/neon_butterflies.rs b/src/neon/neon_butterflies.rs index 4c77e731..c34dcb17 100644 --- a/src/neon/neon_butterflies.rs +++ b/src/neon/neon_butterflies.rs @@ -1353,7 +1353,7 @@ impl NeonF32Butterfly8 { let mut val2 = self.bf4.perform_fft_direct(in13, in57); // step 3: apply twiddle factors - let val2b = self.rotate90.rotate_hi(val2[0]); + let val2b = self.rotate90.rotate_both(val2[0]); // same result as rotate_hi let val2c = vaddq_f32(val2b, val2[0]); let val2d = vmulq_f32(val2c, self.root2); val2[0] = extract_lo_hi_f32(val2[0], val2d); diff --git a/src/neon/neon_utils.rs b/src/neon/neon_utils.rs index def12231..9fa3f689 100644 --- a/src/neon/neon_utils.rs +++ b/src/neon/neon_utils.rs @@ -33,9 +33,9 @@ impl Rotate90F32 { }; let sign_both = unsafe { if positive { - vld1q_f32([-0.0, 0.0, -0.0, 0.0].as_ptr()) + vmovq_n_f32(1.0) } else { - vld1q_f32([0.0, -0.0, 0.0, -0.0].as_ptr()) + vmovq_n_f32(-1.0) } }; Self { @@ -65,11 +65,8 @@ impl Rotate90F32 { #[inline(always)] pub unsafe fn 
rotate_both(&self, values: float32x4_t) -> float32x4_t {
-        let temp = vrev64q_f32(values);
-        vreinterpretq_f32_u32(veorq_u32(
-            vreinterpretq_u32_f32(temp),
-            vreinterpretq_u32_f32(self.sign_both),
-        ))
+        let temp = vmovq_n_f32(0.0);
+        vcmlaq_rot90_f32(temp, self.sign_both, values)
     }
 }
@@ -197,9 +194,9 @@ impl Rotate90F64 {
     pub fn new(positive: bool) -> Self {
         let sign = unsafe {
             if positive {
-                vld1q_f64([-0.0, 0.0].as_ptr())
+                vmovq_n_f64(1.0)
             } else {
-                vld1q_f64([0.0, -0.0].as_ptr())
+                vmovq_n_f64(-1.0)
             }
         };
         Self { sign }
@@ -207,11 +204,8 @@ impl Rotate90F64 {
     #[inline(always)]
     pub unsafe fn rotate(&self, values: float64x2_t) -> float64x2_t {
-        let temp = vcombine_f64(vget_high_f64(values), vget_low_f64(values));
-        vreinterpretq_f64_u64(veorq_u64(
-            vreinterpretq_u64_f64(temp),
-            vreinterpretq_u64_f64(self.sign),
-        ))
+        let temp = vmovq_n_f64(0.0);
+        vcmlaq_rot90_f64(temp, self.sign, values)
     }
 }
@@ -277,4 +271,60 @@ mod unit_tests {
             assert_eq!(second, second_expected);
         }
     }
+
+    #[test]
+    fn test_rotate_both_pos_32() {
+        unsafe {
+            let rotp = Rotate90F32::new(true);
+            let nbr = vld1q_f32([1.0, 2.0, 3.0, 4.0].as_ptr());
+            let pos = rotp.rotate_both(nbr);
+            let result = std::mem::transmute::<float32x4_t, [Complex<f32>; 2]>(pos);
+            let expected = [Complex::new(-2.0, 1.0), Complex::new(-4.0, 3.0)];
+            assert_eq!(result, expected);
+        }
+    }
+
+    #[test]
+    fn test_rotate_both_neg_32() {
+        unsafe {
+            let rotp = Rotate90F32::new(false);
+            let nbr = vld1q_f32([1.0, 2.0, 3.0, 4.0].as_ptr());
+            let neg = rotp.rotate_both(nbr);
+            let result = std::mem::transmute::<float32x4_t, [Complex<f32>; 2]>(neg);
+            let expected = [Complex::new(2.0, -1.0), Complex::new(4.0, -3.0)];
+            assert_eq!(result, expected);
+        }
+    }
+
+    #[test]
+    fn test_rotate_single_pos_32() {
+        unsafe {
+            let rotp = Rotate90F32::new(true);
+            let nbr = vld1q_f32([1.0, 2.0, 3.0, 4.0].as_ptr());
+            let hi = rotp.rotate_hi(nbr);
+            // let lo = rotp.rotate_lo(nbr);
+            let result_hi = std::mem::transmute::<float32x4_t, [Complex<f32>; 2]>(hi);
+            //let result_lo = std::mem::transmute::<float32x4_t, [Complex<f32>; 2]>(hi);
+            let expected_hi = [Complex::new(1.0, 2.0), Complex::new(-4.0, 3.0)];
+            //let expected_lo: [Complex<f32>; 2] = [Complex::new(2.0, -1.0), Complex::new(3.0, 4.0)];
+            assert_eq!(result_hi, expected_hi);
+            //assert_eq!(result_lo, expected_lo);
+        }
+    }
+
+    #[test]
+    fn test_rotate_single_neg_32() {
+        unsafe {
+            let rotp = Rotate90F32::new(false);
+            let nbr = vld1q_f32([1.0, 2.0, 3.0, 4.0].as_ptr());
+            let hi = rotp.rotate_hi(nbr);
+            // let lo = rotp.rotate_lo(nbr);
+            let result_hi = std::mem::transmute::<float32x4_t, [Complex<f32>; 2]>(hi);
+            //let result_lo = std::mem::transmute::<float32x4_t, [Complex<f32>; 2]>(hi);
+            let expected_hi = [Complex::new(1.0, 2.0), Complex::new(4.0, -3.0)];
+            //let expected_lo: [Complex<f32>; 2] = [Complex::new(2.0, -1.0), Complex::new(3.0, 4.0)];
+            assert_eq!(result_hi, expected_hi);
+            //assert_eq!(result_lo, expected_lo);
+        }
+    }
 }

From 0f2c790ec4782c063471c203f06e30346ae221a0 Mon Sep 17 00:00:00 2001
From: Henrik
Date: Sat, 13 Jan 2024 21:27:28 +0100
Subject: [PATCH 3/5] Use combined rotate and add or subtract

---
 src/neon/neon_butterflies.rs | 272 +++++++++++++++++------------------
 src/neon/neon_utils.rs       |  24 +++-
 2 files changed, 153 insertions(+), 143 deletions(-)

diff --git a/src/neon/neon_butterflies.rs b/src/neon/neon_butterflies.rs
index c34dcb17..9ac15cba 100644
--- a/src/neon/neon_butterflies.rs
+++ b/src/neon/neon_butterflies.rs
@@ -966,10 +966,10 @@ impl NeonF32Butterfly5 {
 
         [
             vaddq_f32(value0, vaddq_f32(x14p, x23p)),
-            vaddq_f32(temp_a1, self.rotate.rotate_both(temp_b1)),
-            vaddq_f32(temp_a2, self.rotate.rotate_both(temp_b2)),
-            vsubq_f32(temp_a2, 
self.rotate.rotate_both(temp_b2)), - vsubq_f32(temp_a1, self.rotate.rotate_both(temp_b1)), + self.rotate.rotate_both_and_add(temp_a1, temp_b1), + self.rotate.rotate_both_and_add(temp_a2, temp_b2), + self.rotate.rotate_both_and_sub(temp_a2, temp_b2), + self.rotate.rotate_both_and_sub(temp_a1, temp_b1), ] } } @@ -1068,14 +1068,12 @@ impl NeonF64Butterfly5 { let temp_b1 = vaddq_f64(temp_b1_1, temp_b1_2); let temp_b2 = vsubq_f64(temp_b2_1, temp_b2_2); - let temp_b1_rot = self.rotate.rotate(temp_b1); - let temp_b2_rot = self.rotate.rotate(temp_b2); [ vaddq_f64(value0, vaddq_f64(x14p, x23p)), - vaddq_f64(temp_a1, temp_b1_rot), - vaddq_f64(temp_a2, temp_b2_rot), - vsubq_f64(temp_a2, temp_b2_rot), - vsubq_f64(temp_a1, temp_b1_rot), + self.rotate.rotate_and_add(temp_a1, temp_b1), + self.rotate.rotate_and_add(temp_a2, temp_b2), + self.rotate.rotate_and_sub(temp_a2, temp_b2), + self.rotate.rotate_and_sub(temp_a1, temp_b1), ] } } @@ -1353,8 +1351,9 @@ impl NeonF32Butterfly8 { let mut val2 = self.bf4.perform_fft_direct(in13, in57); // step 3: apply twiddle factors - let val2b = self.rotate90.rotate_both(val2[0]); // same result as rotate_hi - let val2c = vaddq_f32(val2b, val2[0]); + //let val2b = self.rotate90.rotate_both(val2[0]); // same result as rotate_hi + //let val2c = vaddq_f32(val2b, val2[0]); + let val2c = self.rotate90.rotate_both_and_add(val2[0], val2[0]); let val2d = vmulq_f32(val2c, self.root2); val2[0] = extract_lo_hi_f32(val2[0], val2d); @@ -1387,8 +1386,9 @@ impl NeonF32Butterfly8 { .perform_parallel_fft_direct(values[1], values[3], values[5], values[7]); // step 3: apply twiddle factors - let val5b = self.rotate90.rotate_both(val47[1]); - let val5c = vaddq_f32(val5b, val47[1]); + //let val5b = self.rotate90.rotate_both(val47[1]); + //let val5c = vaddq_f32(val5b, val47[1]); + let val5c = self.rotate90.rotate_both_and_add(val47[1], val47[1]); val47[1] = vmulq_f32(val5c, self.root2_dual); val47[2] = self.rotate90.rotate_both(val47[2]); let val7b = self.rotate90.rotate_both(val47[3]); @@ -1470,8 +1470,9 @@ impl NeonF64Butterfly8 { .perform_fft_direct(values[1], values[3], values[5], values[7]); // step 3: apply twiddle factors - let val5b = self.rotate90.rotate(val47[1]); - let val5c = vaddq_f64(val5b, val47[1]); + //let val5b = self.rotate90.rotate(val47[1]); + //let val5c = vaddq_f64(val5b, val47[1]); + let val5c = self.rotate90.rotate_and_add(val47[1], val47[1]); val47[1] = vmulq_f64(val5c, self.root2); val47[2] = self.rotate90.rotate(val47[2]); let val7b = self.rotate90.rotate(val47[3]); @@ -2428,23 +2429,23 @@ impl NeonF32Butterfly16 { odds3[1] = mul_complex_f32(odds3[1], self.twiddle23conj); // step 4: cross FFTs - let mut temp0 = parallel_fft2_interleaved_f32(odds1[0], odds3[0]); - let mut temp1 = parallel_fft2_interleaved_f32(odds1[1], odds3[1]); + let temp0 = parallel_fft2_interleaved_f32(odds1[0], odds3[0]); + let temp1 = parallel_fft2_interleaved_f32(odds1[1], odds3[1]); // apply the butterfly 4 twiddle factor, which is just a rotation - temp0[1] = self.rotate90.rotate_both(temp0[1]); - temp1[1] = self.rotate90.rotate_both(temp1[1]); + //temp0[1] = self.rotate90.rotate_both(temp0[1]); + //temp1[1] = self.rotate90.rotate_both(temp1[1]); //step 5: copy/add/subtract data back to buffer [ vaddq_f32(evens[0], temp0[0]), vaddq_f32(evens[1], temp1[0]), - vaddq_f32(evens[2], temp0[1]), - vaddq_f32(evens[3], temp1[1]), + self.rotate90.rotate_both_and_add(evens[2], temp0[1]), + self.rotate90.rotate_both_and_add(evens[3], temp1[1]), vsubq_f32(evens[0], temp0[0]), vsubq_f32(evens[1], 
temp1[0]), - vsubq_f32(evens[2], temp0[1]), - vsubq_f32(evens[3], temp1[1]), + self.rotate90.rotate_both_and_sub(evens[2], temp0[1]), + self.rotate90.rotate_both_and_sub(evens[3], temp1[1]), ] } @@ -2475,16 +2476,16 @@ impl NeonF32Butterfly16 { odds3[3] = mul_complex_f32(odds3[3], self.twiddle3c); // step 4: cross FFTs - let mut temp0 = parallel_fft2_interleaved_f32(odds1[0], odds3[0]); - let mut temp1 = parallel_fft2_interleaved_f32(odds1[1], odds3[1]); - let mut temp2 = parallel_fft2_interleaved_f32(odds1[2], odds3[2]); - let mut temp3 = parallel_fft2_interleaved_f32(odds1[3], odds3[3]); + let temp0 = parallel_fft2_interleaved_f32(odds1[0], odds3[0]); + let temp1 = parallel_fft2_interleaved_f32(odds1[1], odds3[1]); + let temp2 = parallel_fft2_interleaved_f32(odds1[2], odds3[2]); + let temp3 = parallel_fft2_interleaved_f32(odds1[3], odds3[3]); // apply the butterfly 4 twiddle factor, which is just a rotation - temp0[1] = self.rotate90.rotate_both(temp0[1]); - temp1[1] = self.rotate90.rotate_both(temp1[1]); - temp2[1] = self.rotate90.rotate_both(temp2[1]); - temp3[1] = self.rotate90.rotate_both(temp3[1]); + //temp0[1] = self.rotate90.rotate_both(temp0[1]); + //temp1[1] = self.rotate90.rotate_both(temp1[1]); + //temp2[1] = self.rotate90.rotate_both(temp2[1]); + //temp3[1] = self.rotate90.rotate_both(temp3[1]); //step 5: copy/add/subtract data back to buffer [ @@ -2492,18 +2493,18 @@ impl NeonF32Butterfly16 { vaddq_f32(evens[1], temp1[0]), vaddq_f32(evens[2], temp2[0]), vaddq_f32(evens[3], temp3[0]), - vaddq_f32(evens[4], temp0[1]), - vaddq_f32(evens[5], temp1[1]), - vaddq_f32(evens[6], temp2[1]), - vaddq_f32(evens[7], temp3[1]), + self.rotate90.rotate_both_and_add(evens[4], temp0[1]), + self.rotate90.rotate_both_and_add(evens[5], temp1[1]), + self.rotate90.rotate_both_and_add(evens[6], temp2[1]), + self.rotate90.rotate_both_and_add(evens[7], temp3[1]), vsubq_f32(evens[0], temp0[0]), vsubq_f32(evens[1], temp1[0]), vsubq_f32(evens[2], temp2[0]), vsubq_f32(evens[3], temp3[0]), - vsubq_f32(evens[4], temp0[1]), - vsubq_f32(evens[5], temp1[1]), - vsubq_f32(evens[6], temp2[1]), - vsubq_f32(evens[7], temp3[1]), + self.rotate90.rotate_both_and_sub(evens[4], temp0[1]), + self.rotate90.rotate_both_and_sub(evens[5], temp1[1]), + self.rotate90.rotate_both_and_sub(evens[6], temp2[1]), + self.rotate90.rotate_both_and_sub(evens[7], temp3[1]), ] } } @@ -2623,35 +2624,30 @@ impl NeonF64Butterfly16 { odds3[3] = mul_complex_f64(odds3[3], self.twiddle3c); // step 4: cross FFTs - let mut temp0 = solo_fft2_f64(odds1[0], odds3[0]); - let mut temp1 = solo_fft2_f64(odds1[1], odds3[1]); - let mut temp2 = solo_fft2_f64(odds1[2], odds3[2]); - let mut temp3 = solo_fft2_f64(odds1[3], odds3[3]); + let temp0 = solo_fft2_f64(odds1[0], odds3[0]); + let temp1 = solo_fft2_f64(odds1[1], odds3[1]); + let temp2 = solo_fft2_f64(odds1[2], odds3[2]); + let temp3 = solo_fft2_f64(odds1[3], odds3[3]); - // apply the butterfly 4 twiddle factor, which is just a rotation - temp0[1] = self.rotate90.rotate(temp0[1]); - temp1[1] = self.rotate90.rotate(temp1[1]); - temp2[1] = self.rotate90.rotate(temp2[1]); - temp3[1] = self.rotate90.rotate(temp3[1]); - - //step 5: copy/add/subtract data back to buffer + // step 5: copy/add/subtract data back to buffer, + // and apply the butterfly 4 twiddle factor, which is just a rotation [ vaddq_f64(evens[0], temp0[0]), vaddq_f64(evens[1], temp1[0]), vaddq_f64(evens[2], temp2[0]), vaddq_f64(evens[3], temp3[0]), - vaddq_f64(evens[4], temp0[1]), - vaddq_f64(evens[5], temp1[1]), - vaddq_f64(evens[6], 
temp2[1]), - vaddq_f64(evens[7], temp3[1]), + self.rotate90.rotate_and_add(evens[4], temp0[1]), + self.rotate90.rotate_and_add(evens[5], temp1[1]), + self.rotate90.rotate_and_add(evens[6], temp2[1]), + self.rotate90.rotate_and_add(evens[7], temp3[1]), vsubq_f64(evens[0], temp0[0]), vsubq_f64(evens[1], temp1[0]), vsubq_f64(evens[2], temp2[0]), vsubq_f64(evens[3], temp3[0]), - vsubq_f64(evens[4], temp0[1]), - vsubq_f64(evens[5], temp1[1]), - vsubq_f64(evens[6], temp2[1]), - vsubq_f64(evens[7], temp3[1]), + self.rotate90.rotate_and_sub(evens[4], temp0[1]), + self.rotate90.rotate_and_sub(evens[5], temp1[1]), + self.rotate90.rotate_and_sub(evens[6], temp2[1]), + self.rotate90.rotate_and_sub(evens[7], temp3[1]), ] } } @@ -2842,16 +2838,16 @@ impl NeonF32Butterfly32 { odds3[3] = mul_complex_f32(odds3[3], self.twiddle67conj); // step 4: cross FFTs - let mut temp0 = parallel_fft2_interleaved_f32(odds1[0], odds3[0]); - let mut temp1 = parallel_fft2_interleaved_f32(odds1[1], odds3[1]); - let mut temp2 = parallel_fft2_interleaved_f32(odds1[2], odds3[2]); - let mut temp3 = parallel_fft2_interleaved_f32(odds1[3], odds3[3]); + let temp0 = parallel_fft2_interleaved_f32(odds1[0], odds3[0]); + let temp1 = parallel_fft2_interleaved_f32(odds1[1], odds3[1]); + let temp2 = parallel_fft2_interleaved_f32(odds1[2], odds3[2]); + let temp3 = parallel_fft2_interleaved_f32(odds1[3], odds3[3]); // apply the butterfly 4 twiddle factor, which is just a rotation - temp0[1] = self.rotate90.rotate_both(temp0[1]); - temp1[1] = self.rotate90.rotate_both(temp1[1]); - temp2[1] = self.rotate90.rotate_both(temp2[1]); - temp3[1] = self.rotate90.rotate_both(temp3[1]); + //temp0[1] = self.rotate90.rotate_both(temp0[1]); + //temp1[1] = self.rotate90.rotate_both(temp1[1]); + //temp2[1] = self.rotate90.rotate_both(temp2[1]); + //temp3[1] = self.rotate90.rotate_both(temp3[1]); //step 5: copy/add/subtract data back to buffer [ @@ -2859,18 +2855,18 @@ impl NeonF32Butterfly32 { vaddq_f32(evens[1], temp1[0]), vaddq_f32(evens[2], temp2[0]), vaddq_f32(evens[3], temp3[0]), - vaddq_f32(evens[4], temp0[1]), - vaddq_f32(evens[5], temp1[1]), - vaddq_f32(evens[6], temp2[1]), - vaddq_f32(evens[7], temp3[1]), + self.rotate90.rotate_both_and_add(evens[4], temp0[1]), + self.rotate90.rotate_both_and_add(evens[5], temp1[1]), + self.rotate90.rotate_both_and_add(evens[6], temp2[1]), + self.rotate90.rotate_both_and_add(evens[7], temp3[1]), vsubq_f32(evens[0], temp0[0]), vsubq_f32(evens[1], temp1[0]), vsubq_f32(evens[2], temp2[0]), vsubq_f32(evens[3], temp3[0]), - vsubq_f32(evens[4], temp0[1]), - vsubq_f32(evens[5], temp1[1]), - vsubq_f32(evens[6], temp2[1]), - vsubq_f32(evens[7], temp3[1]), + self.rotate90.rotate_both_and_sub(evens[4], temp0[1]), + self.rotate90.rotate_both_and_sub(evens[5], temp1[1]), + self.rotate90.rotate_both_and_sub(evens[6], temp2[1]), + self.rotate90.rotate_both_and_sub(evens[7], temp3[1]), ] } @@ -2918,24 +2914,24 @@ impl NeonF32Butterfly32 { odds3[7] = mul_complex_f32(odds3[7], self.twiddle7c); // step 4: cross FFTs - let mut temp0 = parallel_fft2_interleaved_f32(odds1[0], odds3[0]); - let mut temp1 = parallel_fft2_interleaved_f32(odds1[1], odds3[1]); - let mut temp2 = parallel_fft2_interleaved_f32(odds1[2], odds3[2]); - let mut temp3 = parallel_fft2_interleaved_f32(odds1[3], odds3[3]); - let mut temp4 = parallel_fft2_interleaved_f32(odds1[4], odds3[4]); - let mut temp5 = parallel_fft2_interleaved_f32(odds1[5], odds3[5]); - let mut temp6 = parallel_fft2_interleaved_f32(odds1[6], odds3[6]); - let mut temp7 = 
parallel_fft2_interleaved_f32(odds1[7], odds3[7]); + let temp0 = parallel_fft2_interleaved_f32(odds1[0], odds3[0]); + let temp1 = parallel_fft2_interleaved_f32(odds1[1], odds3[1]); + let temp2 = parallel_fft2_interleaved_f32(odds1[2], odds3[2]); + let temp3 = parallel_fft2_interleaved_f32(odds1[3], odds3[3]); + let temp4 = parallel_fft2_interleaved_f32(odds1[4], odds3[4]); + let temp5 = parallel_fft2_interleaved_f32(odds1[5], odds3[5]); + let temp6 = parallel_fft2_interleaved_f32(odds1[6], odds3[6]); + let temp7 = parallel_fft2_interleaved_f32(odds1[7], odds3[7]); // apply the butterfly 4 twiddle factor, which is just a rotation - temp0[1] = self.rotate90.rotate_both(temp0[1]); - temp1[1] = self.rotate90.rotate_both(temp1[1]); - temp2[1] = self.rotate90.rotate_both(temp2[1]); - temp3[1] = self.rotate90.rotate_both(temp3[1]); - temp4[1] = self.rotate90.rotate_both(temp4[1]); - temp5[1] = self.rotate90.rotate_both(temp5[1]); - temp6[1] = self.rotate90.rotate_both(temp6[1]); - temp7[1] = self.rotate90.rotate_both(temp7[1]); + //temp0[1] = self.rotate90.rotate_both(temp0[1]); + //temp1[1] = self.rotate90.rotate_both(temp1[1]); + //temp2[1] = self.rotate90.rotate_both(temp2[1]); + //temp3[1] = self.rotate90.rotate_both(temp3[1]); + //temp4[1] = self.rotate90.rotate_both(temp4[1]); + //temp5[1] = self.rotate90.rotate_both(temp5[1]); + //temp6[1] = self.rotate90.rotate_both(temp6[1]); + //temp7[1] = self.rotate90.rotate_both(temp7[1]); //step 5: copy/add/subtract data back to buffer [ @@ -2947,14 +2943,14 @@ impl NeonF32Butterfly32 { vaddq_f32(evens[5], temp5[0]), vaddq_f32(evens[6], temp6[0]), vaddq_f32(evens[7], temp7[0]), - vaddq_f32(evens[8], temp0[1]), - vaddq_f32(evens[9], temp1[1]), - vaddq_f32(evens[10], temp2[1]), - vaddq_f32(evens[11], temp3[1]), - vaddq_f32(evens[12], temp4[1]), - vaddq_f32(evens[13], temp5[1]), - vaddq_f32(evens[14], temp6[1]), - vaddq_f32(evens[15], temp7[1]), + self.rotate90.rotate_both_and_add(evens[8], temp0[1]), + self.rotate90.rotate_both_and_add(evens[9], temp1[1]), + self.rotate90.rotate_both_and_add(evens[10], temp2[1]), + self.rotate90.rotate_both_and_add(evens[11], temp3[1]), + self.rotate90.rotate_both_and_add(evens[12], temp4[1]), + self.rotate90.rotate_both_and_add(evens[13], temp5[1]), + self.rotate90.rotate_both_and_add(evens[14], temp6[1]), + self.rotate90.rotate_both_and_add(evens[15], temp7[1]), vsubq_f32(evens[0], temp0[0]), vsubq_f32(evens[1], temp1[0]), vsubq_f32(evens[2], temp2[0]), @@ -2963,14 +2959,14 @@ impl NeonF32Butterfly32 { vsubq_f32(evens[5], temp5[0]), vsubq_f32(evens[6], temp6[0]), vsubq_f32(evens[7], temp7[0]), - vsubq_f32(evens[8], temp0[1]), - vsubq_f32(evens[9], temp1[1]), - vsubq_f32(evens[10], temp2[1]), - vsubq_f32(evens[11], temp3[1]), - vsubq_f32(evens[12], temp4[1]), - vsubq_f32(evens[13], temp5[1]), - vsubq_f32(evens[14], temp6[1]), - vsubq_f32(evens[15], temp7[1]), + self.rotate90.rotate_both_and_sub(evens[8], temp0[1]), + self.rotate90.rotate_both_and_sub(evens[9], temp1[1]), + self.rotate90.rotate_both_and_sub(evens[10], temp2[1]), + self.rotate90.rotate_both_and_sub(evens[11], temp3[1]), + self.rotate90.rotate_both_and_sub(evens[12], temp4[1]), + self.rotate90.rotate_both_and_sub(evens[13], temp5[1]), + self.rotate90.rotate_both_and_sub(evens[14], temp6[1]), + self.rotate90.rotate_both_and_sub(evens[15], temp7[1]), ] } } @@ -3154,26 +3150,18 @@ impl NeonF64Butterfly32 { odds3[7] = mul_complex_f64(odds3[7], self.twiddle7c); // step 4: cross FFTs - let mut temp0 = solo_fft2_f64(odds1[0], odds3[0]); - let mut temp1 = 
solo_fft2_f64(odds1[1], odds3[1]); - let mut temp2 = solo_fft2_f64(odds1[2], odds3[2]); - let mut temp3 = solo_fft2_f64(odds1[3], odds3[3]); - let mut temp4 = solo_fft2_f64(odds1[4], odds3[4]); - let mut temp5 = solo_fft2_f64(odds1[5], odds3[5]); - let mut temp6 = solo_fft2_f64(odds1[6], odds3[6]); - let mut temp7 = solo_fft2_f64(odds1[7], odds3[7]); - + let temp0 = solo_fft2_f64(odds1[0], odds3[0]); + let temp1 = solo_fft2_f64(odds1[1], odds3[1]); + let temp2 = solo_fft2_f64(odds1[2], odds3[2]); + let temp3 = solo_fft2_f64(odds1[3], odds3[3]); + let temp4 = solo_fft2_f64(odds1[4], odds3[4]); + let temp5 = solo_fft2_f64(odds1[5], odds3[5]); + let temp6 = solo_fft2_f64(odds1[6], odds3[6]); + let temp7 = solo_fft2_f64(odds1[7], odds3[7]); + + // step 5: copy/add/subtract data back to buffer + // and // apply the butterfly 4 twiddle factor, which is just a rotation - temp0[1] = self.rotate90.rotate(temp0[1]); - temp1[1] = self.rotate90.rotate(temp1[1]); - temp2[1] = self.rotate90.rotate(temp2[1]); - temp3[1] = self.rotate90.rotate(temp3[1]); - temp4[1] = self.rotate90.rotate(temp4[1]); - temp5[1] = self.rotate90.rotate(temp5[1]); - temp6[1] = self.rotate90.rotate(temp6[1]); - temp7[1] = self.rotate90.rotate(temp7[1]); - - //step 5: copy/add/subtract data back to buffer [ vaddq_f64(evens[0], temp0[0]), vaddq_f64(evens[1], temp1[0]), @@ -3183,14 +3171,14 @@ impl NeonF64Butterfly32 { vaddq_f64(evens[5], temp5[0]), vaddq_f64(evens[6], temp6[0]), vaddq_f64(evens[7], temp7[0]), - vaddq_f64(evens[8], temp0[1]), - vaddq_f64(evens[9], temp1[1]), - vaddq_f64(evens[10], temp2[1]), - vaddq_f64(evens[11], temp3[1]), - vaddq_f64(evens[12], temp4[1]), - vaddq_f64(evens[13], temp5[1]), - vaddq_f64(evens[14], temp6[1]), - vaddq_f64(evens[15], temp7[1]), + self.rotate90.rotate_and_add(evens[8], temp0[1]), + self.rotate90.rotate_and_add(evens[9], temp1[1]), + self.rotate90.rotate_and_add(evens[10], temp2[1]), + self.rotate90.rotate_and_add(evens[11], temp3[1]), + self.rotate90.rotate_and_add(evens[12], temp4[1]), + self.rotate90.rotate_and_add(evens[13], temp5[1]), + self.rotate90.rotate_and_add(evens[14], temp6[1]), + self.rotate90.rotate_and_add(evens[15], temp7[1]), vsubq_f64(evens[0], temp0[0]), vsubq_f64(evens[1], temp1[0]), vsubq_f64(evens[2], temp2[0]), @@ -3199,14 +3187,14 @@ impl NeonF64Butterfly32 { vsubq_f64(evens[5], temp5[0]), vsubq_f64(evens[6], temp6[0]), vsubq_f64(evens[7], temp7[0]), - vsubq_f64(evens[8], temp0[1]), - vsubq_f64(evens[9], temp1[1]), - vsubq_f64(evens[10], temp2[1]), - vsubq_f64(evens[11], temp3[1]), - vsubq_f64(evens[12], temp4[1]), - vsubq_f64(evens[13], temp5[1]), - vsubq_f64(evens[14], temp6[1]), - vsubq_f64(evens[15], temp7[1]), + self.rotate90.rotate_and_sub(evens[8], temp0[1]), + self.rotate90.rotate_and_sub(evens[9], temp1[1]), + self.rotate90.rotate_and_sub(evens[10], temp2[1]), + self.rotate90.rotate_and_sub(evens[11], temp3[1]), + self.rotate90.rotate_and_sub(evens[12], temp4[1]), + self.rotate90.rotate_and_sub(evens[13], temp5[1]), + self.rotate90.rotate_and_sub(evens[14], temp6[1]), + self.rotate90.rotate_and_sub(evens[15], temp7[1]), ] } } diff --git a/src/neon/neon_utils.rs b/src/neon/neon_utils.rs index 9fa3f689..27519565 100644 --- a/src/neon/neon_utils.rs +++ b/src/neon/neon_utils.rs @@ -47,6 +47,9 @@ impl Rotate90F32 { #[inline(always)] pub unsafe fn rotate_hi(&self, values: float32x4_t) -> float32x4_t { + // sign = zero in lo + // acc = get only hi + // result = vcmlaq acc + sign*value vcombine_f32( vget_low_f32(values), vreinterpret_f32_u32(veor_u32( @@ -68,6 
+71,16 @@ impl Rotate90F32 { let temp = vmovq_n_f32(0.0); vcmlaq_rot90_f32(temp, self.sign_both, values) } + + #[inline(always)] + pub unsafe fn rotate_both_and_add(&self, acc: float32x4_t, values: float32x4_t) -> float32x4_t { + vcmlaq_rot90_f32(acc, self.sign_both, values) + } + + #[inline(always)] + pub unsafe fn rotate_both_and_sub(&self, acc: float32x4_t, values: float32x4_t) -> float32x4_t { + vcmlaq_rot270_f32(acc, self.sign_both, values) + } } // Pack low (1st) complex @@ -178,7 +191,6 @@ pub unsafe fn mul_complex_f32(left: float32x4_t, right: float32x4_t) -> float32x vcmlaq_rot90_f32(step1, left, right) } - // __ __ _ _ __ _ _ _ _ _ // | \/ | __ _| |_| |__ / /_ | || | | |__ (_) |_ // | |\/| |/ _` | __| '_ \ _____ | '_ \| || |_| '_ \| | __| @@ -207,6 +219,16 @@ impl Rotate90F64 { let temp = vmovq_n_f64(0.0); vcmlaq_rot90_f64(temp, self.sign, values) } + + #[inline(always)] + pub unsafe fn rotate_and_add(&self, acc: float64x2_t, values: float64x2_t) -> float64x2_t { + vcmlaq_rot90_f64(acc, self.sign, values) + } + + #[inline(always)] + pub unsafe fn rotate_and_sub(&self, acc: float64x2_t, values: float64x2_t) -> float64x2_t { + vcmlaq_rot270_f64(acc, self.sign, values) + } } #[inline(always)] From 746456221c3f038b478c4c2bfaa5ca76f3803059 Mon Sep 17 00:00:00 2001 From: Henrik Date: Sun, 14 Jan 2024 20:35:16 +0100 Subject: [PATCH 4/5] Some optimization of butterfly 4 --- src/neon/neon_butterflies.rs | 26 ++++++++++++++++---------- src/neon/neon_utils.rs | 9 ++++----- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/neon/neon_butterflies.rs b/src/neon/neon_butterflies.rs index 9ac15cba..e813bf2e 100644 --- a/src/neon/neon_butterflies.rs +++ b/src/neon/neon_butterflies.rs @@ -707,19 +707,22 @@ impl NeonF32Butterfly4 { // and // step 2: column FFTs let temp0 = parallel_fft2_interleaved_f32(values0, values2); - let mut temp1 = parallel_fft2_interleaved_f32(values1, values3); + let temp1 = parallel_fft2_interleaved_f32(values1, values3); // step 3: apply twiddle factors (only one in this case, and it's either 0 + i or 0 - i) - temp1[1] = self.rotate.rotate_both(temp1[1]); + // skipped because this is applied together with the second row FFT // step 4: transpose, which we're skipping because we're the previous FFTs were non-contiguous // step 5: row FFTs - let out0 = parallel_fft2_interleaved_f32(temp0[0], temp1[0]); - let out2 = parallel_fft2_interleaved_f32(temp0[1], temp1[1]); + let out0 = vaddq_f32(temp0[0], temp1[0]); + let out1 = vsubq_f32(temp0[0], temp1[0]); + // second row FFT that includes applying the twiddle factor + let out2 = self.rotate.rotate_both_and_add(temp0[1], temp1[1]); + let out3 = self.rotate.rotate_both_and_sub(temp0[1], temp1[1]); // step 6: transpose by swapping index 1 and 2 - [out0[0], out2[0], out0[1], out2[1]] + [out0, out2, out1, out3] } } @@ -787,19 +790,22 @@ impl NeonF64Butterfly4 { // and // step 2: column FFTs let temp0 = solo_fft2_f64(value0, value2); - let mut temp1 = solo_fft2_f64(value1, value3); + let temp1 = solo_fft2_f64(value1, value3); // step 3: apply twiddle factors (only one in this case, and it's either 0 + i or 0 - i) - temp1[1] = self.rotate.rotate(temp1[1]); + // skipped because this is applied together with the second row FFT // step 4: transpose, which we're skipping because we're the previous FFTs were non-contiguous // step 5: row FFTs - let out0 = solo_fft2_f64(temp0[0], temp1[0]); - let out2 = solo_fft2_f64(temp0[1], temp1[1]); + let out0 = vaddq_f64(temp0[0], temp1[0]); + let out1 = vsubq_f64(temp0[0], temp1[0]); 
+ // second row FFT that includes applying the twiddle factor + let out2 = self.rotate.rotate_and_add(temp0[1], temp1[1]); + let out3 = self.rotate.rotate_and_sub(temp0[1], temp1[1]); // step 6: transpose by swapping index 1 and 2 - [out0[0], out2[0], out0[1], out2[1]] + [out0, out2, out1, out3] } } diff --git a/src/neon/neon_utils.rs b/src/neon/neon_utils.rs index 27519565..2ea09f3f 100644 --- a/src/neon/neon_utils.rs +++ b/src/neon/neon_utils.rs @@ -185,7 +185,8 @@ pub unsafe fn transpose_complex_2x2_f32(left: float32x4_t, right: float32x4_t) - // Each input contains two complex values, which are multiplied in parallel. #[inline(always)] pub unsafe fn mul_complex_f32(left: float32x4_t, right: float32x4_t) -> float32x4_t { - // ARMv8.2-A introduced vcmulq_f32 and vcmlaq_f32 for complex multiplication, these intrinsics are not yet available. + // The complex multiplication intrinsics are all of the type multiply-accumulate, + // thus we need a zero vector to start from. let temp = vmovq_n_f32(0.0); let step1 = vcmlaq_f32(temp, left, right); vcmlaq_rot90_f32(step1, left, right) @@ -233,10 +234,8 @@ impl Rotate90F64 { #[inline(always)] pub unsafe fn mul_complex_f64(left: float64x2_t, right: float64x2_t) -> float64x2_t { - // ARMv8.2-A introduced vcmulq_f64 and vcmlaq_f64 for complex multiplication, these intrinsics are not yet available. - //let temp = vcombine_f64(vneg_f64(vget_high_f64(left)), vget_low_f64(left)); - //let sum = vmulq_laneq_f64::<0>(left, right); - //vfmaq_laneq_f64::<1>(sum, temp, right) + // The complex multiplication intrinsics are all of the type multiply-accumulate, + // thus we need a zero vector to start from. let temp = vmovq_n_f64(0.0); let step1 = vcmlaq_f64(temp, left, right); vcmlaq_rot90_f64(step1, left, right) From b037b507cce95560b80fe261643d917aa935c0c2 Mon Sep 17 00:00:00 2001 From: HEnquist Date: Mon, 29 Jul 2024 21:11:36 +0200 Subject: [PATCH 5/5] Update fcma feature --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 7277c1f7..f3bf8ff1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,5 @@ #![cfg_attr(all(feature = "bench", test), feature(test))] -#![feature(stdsimd)] +#![feature(stdarch_neon_fcma)] //! RustFFT is a high-performance FFT library written in pure Rust. //!
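
Note on the FCMA intrinsics used throughout this series (illustrative sketch, not part of the patches):
the vcmlaq_f32/vcmlaq_f64 intrinsics and their _rot90/_rot270 variants map to the FCMLA instruction of
the Armv8.3-A FCMA extension, and are nightly-only, hence the feature gate added in patch 1 and renamed
in patch 5. Each call is a fused multiply-accumulate on interleaved complex numbers, and the patches rely
on three identities: starting from a zero accumulator, FCMLA #0 followed by FCMLA #90 is a full complex
multiply (mul_complex_*); for the positive rotation direction, with the "sign" operand set to all 1.0
lanes (vmovq_n_*(1.0)), FCMLA #90 adds i*b to the accumulator (rotate*_and_add), and FCMLA #270
subtracts it (rotate*_and_sub). The plain-Rust model below spells this out for a single complex lane,
following my reading of the Arm pseudocode for FCMLA; the fcmla helper and its names are made up for
illustration, and it assumes the num-complex crate is available.

// Scalar model of one complex lane of FCMLA (illustrative only, not from the patches).
use num_complex::Complex;

fn fcmla(acc: Complex<f32>, a: Complex<f32>, b: Complex<f32>, rot: u32) -> Complex<f32> {
    match rot {
        // acc + a.re * b
        0 => Complex::new(acc.re + a.re * b.re, acc.im + a.re * b.im),
        // acc + a.im * (i * b)
        90 => Complex::new(acc.re - a.im * b.im, acc.im + a.im * b.re),
        // acc - a.re * b
        180 => Complex::new(acc.re - a.re * b.re, acc.im - a.re * b.im),
        // acc - a.im * (i * b)
        270 => Complex::new(acc.re + a.im * b.im, acc.im - a.im * b.re),
        _ => panic!("rotation must be 0, 90, 180 or 270"),
    }
}

fn main() {
    let a = Complex::new(1.0f32, 2.0);
    let b = Complex::new(3.0f32, 4.0);
    let zero = Complex::new(0.0f32, 0.0);

    // mul_complex_*: FCMLA #0 then FCMLA #90, starting from zero, gives the full product a*b.
    assert_eq!(fcmla(fcmla(zero, a, b, 0), a, b, 90), a * b);

    // rotate*_and_add / rotate*_and_sub: with the sign operand 1 + 1i,
    // FCMLA #90 adds i*b to the accumulator and FCMLA #270 subtracts it.
    let sign = Complex::new(1.0f32, 1.0);
    let acc = Complex::new(10.0f32, 20.0);
    let i_times_b = Complex::new(-b.im, b.re);
    assert_eq!(fcmla(acc, sign, b, 90), acc + i_times_b);
    assert_eq!(fcmla(acc, sign, b, 270), acc - i_times_b);

    println!("FCMLA model checks passed");
}

These identities are also why patches 3 and 4 can drop the separate rotation step in the butterflies:
each "rotate, then vaddq/vsubq" pair collapses into a single rotate_*_and_add or rotate_*_and_sub call,
which is one FCMLA instruction instead of a rotate followed by an add or subtract.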