diff --git a/Cargo.toml b/Cargo.toml index 1f972979..817966ca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,2 +1,2 @@ [workspace] -members = [ "surge-hound", "surge-stepseq", "surge-imports", "surge-biquad", "surge-blitter", "surge-coeffmaker", "surge-constants", "surge-filter", "surge-halfrate", "surge-input", "surge-lag", "surge-lipol", "surge-math", "surge-midi", "surge-modulation", "surge-mpe", "surge-output", "surge-param", "surge-qfunit", "surge-quadrosc", "surge-samplerate", "surge-scene", "surge-voice", "surge-svf", "surge-synthesizer", "surge-tables", "surge-timeunit", "surge-traits", "surge-tuning", "surge-types", "surge-wavetable", "surgefilter-comb", "surgefilter-diode", "surgefilter-iir", "surgefilter-k35", "surgefilter-huovilainen", "surgefilter-rungekutta", "surgefilter-moog", "surgefilter-nlfeedback", "surgefilter-nlstates", "surgefilter-obxd", "surgefilter-snh", "surgefilter-svf", "surgefx-allpass", "surgefx-chorus", "surgefx-conditioner", "surgefx-distortion", "surgefx-dualdelay", "surgefx-emphasize", "surgefx-eq3band", "surgefx-flanger", "surgefx-freqshift", "surgefx-phaser", "surgefx-reverb", "surgefx-ringmod", "surgefx-rotary", "surgefx-vocoder", "surgeosc-audioin", "surgeosc-fm", "surgeosc-fm2", "surgeosc-sine", "surgeosc-snh", "surgeosc-super", "surgeosc-wavetable", "surgeosc-window", "surgeshaper-asym", "surgeshaper-clip", "surgeshaper-digi", "surgeshaper-sine", "surgeshaper-tanh", "surge-derive", "surge-errors" ] \ No newline at end of file +members = [ "surge-hound", "surge-stepseq", "surge-imports", "surge-biquad", "surge-blitter", "surge-coeffmaker", "surge-constants", "surge-filter", "surge-halfrate", "surge-input", "surge-lag", "surge-lipol", "surge-math", "surge-midi", "surge-modulation", "surge-mpe", "surge-output", "surge-param", "surge-qfunit", "surge-quadrosc", "surge-samplerate", "surge-scene", "surge-voice", "surge-svf", "surge-synthesizer", "surge-tables", "surge-timeunit", "surge-traits", "surge-tuning", "surge-types", "surge-wavetable", "surgefilter-comb", "surgefilter-diode", "surgefilter-iir", "surgefilter-k35", "surgefilter-huovilainen", "surgefilter-rungekutta", "surgefilter-moog", "surgefilter-nlfeedback", "surgefilter-nlstates", "surgefilter-obxd", "surgefilter-snh", "surgefilter-svf", "surgefx-allpass", "surgefx-chorus", "surgefx-conditioner", "surgefx-distortion", "surgefx-dualdelay", "surgefx-emphasize", "surgefx-eq3band", "surgefx-flanger", "surgefx-freqshift", "surgefx-phaser", "surgefx-reverb", "surgefx-ringmod", "surgefx-rotary", "surgefx-vocoder", "surgeosc-audioin", "surgeosc-fm", "surgeosc-fm2", "surgeosc-sine", "surgeosc-snh", "surgeosc-super", "surgeosc-wavetable", "surgeosc-window", "surgeshaper-asym", "surgeshaper-clip", "surgeshaper-digi", "surgeshaper-sine", "surgeshaper-tanh", "surge-derive", "surge-errors" , "surge-aarch64"] diff --git a/surge-aarch64/Cargo.toml b/surge-aarch64/Cargo.toml new file mode 100644 index 00000000..a2b303fc --- /dev/null +++ b/surge-aarch64/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "surge-aarch64" +version = "0.1.0" +edition = "2021" + +[dependencies.surge-imports] +path = "../surge-imports" diff --git a/surge-aarch64/src/aarch64.rs b/surge-aarch64/src/aarch64.rs new file mode 100644 index 00000000..fc6041d9 --- /dev/null +++ b/surge-aarch64/src/aarch64.rs @@ -0,0 +1,471 @@ +// https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h + +crate::ix!(); + +use std::arch::aarch64::*; + +pub type __m128 = float32x4_t; +pub type __m128i = int32x4_t; + +#[macro_export] +macro_rules! 
vreinterpretq_m128i_s32 { + ($x: expr) => { + // `__m128i` is defined as `int32x4_t` in this crate, so reinterpreting from `s32` is the identity. + $x + }; +} + +#[macro_export] +macro_rules! vreinterpretq_m128_f32 { + ($x: expr) => { + $x + }; +} + +/** + * Loads a single-precision, floating-point value into the low word and clears the upper three words. https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx + */ +#[inline] +pub fn _mm_load_ss(p: &f32) -> float32x4_t { + return unsafe { + let result: float32x4_t = vdupq_n_f32(0_f32); + vsetq_lane_f32(*p, result, 0) + }; +} + +// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx +#[inline] +pub fn _mm_load_ps(p: *const f32) -> float32x4_t { + return unsafe { vld1q_f32(p) }; +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps +#[inline] +pub unsafe fn _mm_load1_ps(p: *const f32) -> float32x4_t { + return vld1q_dup_f32(p); +} + +/** + * Computes the maximums of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx + */ +#[inline] +pub fn _mm_max_ps(a: float32x4_t, b: float32x4_t) -> float32x4_t { + // #if USE_PRECISE_MINMAX_IMPLEMENTATION + // return vbslq_f32(vcltq_f32(b,a),a,b); + // #else + // // Faster, but would give inconsistent rendering (e.g. holes, NaN pixels) + // return vmaxq_f32(a, b); + // #endif + + return unsafe { vbslq_f32(vcltq_f32(b, a), a, b) }; +} + +/** + * Computes the minima of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx + */ +#[inline] +pub fn _mm_min_ps(a: float32x4_t, b: float32x4_t) -> float32x4_t { + // #if USE_PRECISE_MINMAX_IMPLEMENTATION + // return vbslq_f32(vcltq_f32(a,b),a,b); + // #else + // // Faster, but would give inconsistent rendering (e.g. holes, NaN pixels) + // return vminq_f32(a, b); + // #endif + + return unsafe { vbslq_f32(vcltq_f32(a, b), a, b) }; +} + +/** + * Computes the maximum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx + */ +#[inline] +pub fn _mm_max_ss(a: float32x4_t, b: float32x4_t) -> float32x4_t { + // float32x4_t value; + // float32x4_t result = a; + + let value = _mm_max_ps(a, b); + return unsafe { vsetq_lane_f32(vgetq_lane_f32(value, 0), a, 0) }; +} + +/** + * Computes the minimum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx + */ +#[inline] +pub fn _mm_min_ss(a: float32x4_t, b: float32x4_t) -> float32x4_t { + let value = _mm_min_ps(a, b); + return unsafe { vsetq_lane_f32(vgetq_lane_f32(value, 0), a, 0) }; +} + +/** + * Stores the lower single-precision, floating-point value. https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx + */ +#[inline] +pub fn _mm_store_ss(p: *mut f32, a: float32x4_t) { + unsafe { vst1q_lane_f32(p, a, 0) }; +} + +/** + * Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx + */ +#[inline] +pub fn _mm_store_ps(p: *mut f32, a: float32x4_t) { + unsafe { vst1q_f32(p, a) }; +} + +/** + * Multiplies the four single-precision, floating-point values of a and b. 
https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx + */ +#[inline] +pub fn _mm_mul_ps(a: float32x4_t, b: float32x4_t) -> float32x4_t { + return unsafe { vmulq_f32(a, b) }; +} +/** + * Multiplies the lower scalar single-precision, floating-point values of a and b; the upper three values are passed through from a. https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx + */ +#[inline] +pub fn _mm_mul_ss(a: float32x4_t, b: float32x4_t) -> float32x4_t { + // Only lane 0 is multiplied; the upper lanes of `a` are passed through, matching the SSE scalar op. + return unsafe { vsetq_lane_f32(vgetq_lane_f32(a, 0) * vgetq_lane_f32(b, 0), a, 0) }; +} + +// Clears the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx +#[inline] +pub unsafe fn _mm_setzero_ps() -> float32x4_t { + return vdupq_n_f32(0_f32); +} + +// Set packed single-precision (32-bit) floating-point elements in dst with the +// supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps +#[inline] +pub unsafe fn _mm_set_ps(w: f32, z: f32, y: f32, x: f32) -> float32x4_t { + // float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; + + // Use a plain array rather than a struct: the field layout of a default + // (non-#[repr(C)]) Rust struct is not guaranteed, an array's is. + let data: [f32; 4] = [x, y, z, w]; + return vld1q_f32(data.as_ptr()); +} + +// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +#[inline] +pub fn _mm_set1_ps(_w: f32) -> float32x4_t { + return unsafe { vdupq_n_f32(_w) }; +} + +// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +#[inline] +pub fn _mm_set_ps1(_w: f32) -> float32x4_t { + return unsafe { vdupq_n_f32(_w) }; +} + +// Adds the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx +#[inline] +pub unsafe fn _mm_add_ps(a: float32x4_t, b: float32x4_t) -> float32x4_t { + return vaddq_f32(a, b); +} + +// Adds the scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx +#[inline] +pub fn _mm_add_ss(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe { + let b0 = vgetq_lane_f32(b, 0); + let mut value = vdupq_n_f32(0_f32); + + // The upper values in the result must be the remnants of `a`, so only lane 0 of `b` is added. + value = vsetq_lane_f32(b0, value, 0); + return vaddq_f32(a, value); + }; +} + +// Compute the bitwise AND of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps +#[inline] +pub unsafe fn _mm_and_ps(a: float32x4_t, b: float32x4_t) -> float32x4_t { + return vreinterpretq_f32_s32(vandq_s32( + vreinterpretq_s32_f32(a), + vreinterpretq_s32_f32(b), + )); +} + +// Subtracts the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx +#[inline] +pub fn _mm_sub_ps(a: float32x4_t, b: float32x4_t) -> float32x4_t { + return unsafe { vsubq_f32(a, b) }; +} + +// Subtracts the lower scalar single-precision, floating-point value of b from a; the upper three values are passed through from a, matching the SSE scalar op. +#[inline] +pub fn _mm_sub_ss(a: float32x4_t, b: float32x4_t) -> float32x4_t { + return unsafe { vsetq_lane_f32(vgetq_lane_f32(a, 0) - vgetq_lane_f32(b, 0), a, 0) }; +} + +/*******************************************************/ +/* MACRO for shuffle parameter for _mm_shuffle_ps(). */ +/* Argument fp3 is a digit[0123] that represents the fp*/ +/* from argument "b" of mm_shuffle_ps that will be */ +/* placed in fp3 of result. fp2 is the same for fp2 in */ +/* result. 
fp1 is a digit[0123] that represents the fp */ +/* from argument "a" of mm_shuffle_ps that will be */ +/* placed in fp1 of result. fp0 is the same for fp0 of */ +/* result */ +/*******************************************************/ + +// fn _MN_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3), (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3) } ) + +// fn _MF_SHUFFLE(fp3,fp2,fp1,fp0) { (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2), (((fp1)*4)+16+3), (((fp0)*4)+16+0), (((fp0)*4)+16+1), (((fp0)*4)+16+2), (((fp0)*4)+16+3) } } + +// pub fn _MM_SHUFFLE(fp3: u8, fp2: u8, fp1: u8, fp0: u8) -> u8 { +// ((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0) +// } + +#[macro_export] +macro_rules! _MM_SHUFFLE { + ($fp3: literal, $fp2: literal, $fp1: literal, $fp0: literal) => { + (($fp3) << 6) | (($fp2) << 4) | (($fp1) << 2) | ($fp0) + }; +} + +#[macro_export] +macro_rules! vreinterpretq_f32_m128 { + ($x: expr) => { + $x + }; +} + +// NEON does not support a general purpose permute intrinsic. +// Shuffle single-precision (32-bit) floating-point elements in a using the +// control in imm8, and store the results in dst. +// +// C equivalent: +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, +// __constrange(0, 255) int imm) { +// __m128 ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// return ret; +// } +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps +// _mm_shuffle_ps_default +// pub fn _mm_shuffle_ps(a: float32x4_t, b: float32x4_t, imm: *const i8) -> float32x4_t { +// unsafe { +// vreinterpretq_m128_f32!(vsetq_lane_f32( +// vgetq_lane_f32(vreinterpretq_f32_m128!(b), ((imm) >> 6) & 0x3), +// vsetq_lane_f32( +// vgetq_lane_f32(vreinterpretq_f32_m128!(b), ((imm) >> 4) & 0x3), +// vsetq_lane_f32( +// vgetq_lane_f32(vreinterpretq_f32_m128!(a), ((imm) >> 2) & 0x3), +// vmovq_n_f32(vgetq_lane_f32(vreinterpretq_f32_m128!(a), (imm) & (0x3))), +// 1, +// ), +// 2, +// ), +// 3, +// )) +// } +// } + +// #[macro_export] +// macro_rules! _mm_shuffle_ps { +// ($a: ident, $b: ident, $imm: tt) => { +// unsafe { +// vreinterpretq_m128_f32(vsetq_lane_f32( +// vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), +// vsetq_lane_f32( +// vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), +// vsetq_lane_f32( +// vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), +// vmovq_n_f32(vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), +// 1, +// ), +// 2, +// ), +// 3, +// )) +// } +// }; +// } + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps +#[inline] +pub unsafe fn _mm_storeu_ps(p: *mut f32, a: float32x4_t) { + vst1q_f32(p, a); +} + +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr does not need to be aligned on any +// particular boundary. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps +#[inline] +pub unsafe fn _mm_loadu_ps(p: *const f32) -> float32x4_t { + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are + // equivalent for neon + return vld1q_f32(p); +} + +// Compute the approximate reciprocal square root of packed single-precision +// (32-bit) floating-point elements in a, and store the results in dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps +#[inline] +pub unsafe fn _mm_rsqrt_ps(i: float32x4_t) -> float32x4_t { + let mut out: float32x4_t = vrsqrteq_f32(i); + + // Generate masks for detecting whether input has any 0.0f/-0.0f + // (which becomes positive/negative infinity by IEEE-754 arithmetic rules). + let pos_inf: uint32x4_t = vdupq_n_u32(0x7F800000); + let neg_inf: uint32x4_t = vdupq_n_u32(0xFF800000); + let has_pos_zero: uint32x4_t = vceqq_u32(pos_inf, vreinterpretq_u32_f32(out)); + let has_neg_zero: uint32x4_t = vceqq_u32(neg_inf, vreinterpretq_u32_f32(out)); + + out = vmulq_f32(out, vrsqrtsq_f32(vmulq_f32(i, out), out)); + // #if SSE2NEON_PRECISE_SQRT + // // Additional Netwon-Raphson iteration for accuracy + // out = vmulq_f32( + // out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); + // #endif + + // Set output vector element to infinity/negative-infinity if + // the corresponding input vector element is 0.0f/-0.0f. + out = vbslq_f32(has_pos_zero, vreinterpretq_f32_u32(pos_inf), out); + out = vbslq_f32(has_neg_zero, vreinterpretq_f32_u32(neg_inf), out); + + return out; +} + +// Compute the approximate reciprocal square root of the lower single-precision +// (32-bit) floating-point element in a, store the result in the lower element +// of dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss +#[inline] +pub unsafe fn _mm_rsqrt_ss(i: float32x4_t) -> float32x4_t { + return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(i), 0), i, 0); +} + +// Broadcast 32-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32 +#[inline] +pub unsafe fn _mm_set1_epi32(_i: i32) -> int32x4_t { + return vdupq_n_s32(_i); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32 +#[inline] +pub unsafe fn _mm_cvttps_epi32(a: float32x4_t) -> int32x4_t { + return vcvtq_s32_f32(a); +} + +// Add packed 32-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32 +#[inline] +pub unsafe fn _mm_add_epi32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + return vaddq_s32(a, b); +} + +// Cast vector of type __m128i to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps +#[inline] +pub unsafe fn _mm_castsi128_ps(a: int32x4_t) -> float32x4_t { + // `__m128i` is `int32x4_t` in this crate, so a single reinterpret suffices. + return vreinterpretq_f32_s32(a); +} + +// #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) + +// Copy the lower single-precision (32-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32 +#[inline] +pub unsafe fn _mm_cvtss_f32(a: float32x4_t) -> f32 { + return vgetq_lane_f32(a, 0); +} + +// Divide packed single-precision (32-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. +// Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement +// division by multiplying a by b's reciprocal before using the Newton-Raphson +// method to approximate the results. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps +#[inline] +pub unsafe fn _mm_div_ps(a: float32x4_t, b: float32x4_t) -> float32x4_t { + // #if defined(__aarch64__) || defined(_M_ARM64) + // return vreinterpretq_m128_f32( + // vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + // #else + // float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b)); + // recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); + // // Additional Newton-Raphson iteration for accuracy + // recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); + // return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip)); + // #endif + + return vdivq_f32(a, b); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128 +#[inline] +pub unsafe fn _mm_and_si128(a: int32x4_t, b: int32x4_t) -> int32x4_t { + return vandq_s32(a, b); +} + +// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32 +#[inline] +pub unsafe fn _mm_sub_epi32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + return vsubq_s32(a, b); +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps +#[inline] +pub unsafe fn _mm_cvtepi32_ps(a: int32x4_t) -> float32x4_t { + return vcvtq_f32_s32(a); +} + +// Compute the approximate reciprocal of packed single-precision (32-bit) +// floating-point elements in a, and store the results in dst. The maximum +// relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps +#[inline] +pub unsafe fn _mm_rcp_ps(i: float32x4_t) -> float32x4_t { + let mut recip: float32x4_t = vrecpeq_f32(i); + recip = vmulq_f32(recip, vrecpsq_f32(recip, i)); + // TODO: SSE2NEON_PRECISE_DIV + // #if SSE2NEON_PRECISE_DIV + // // Additional Newton-Raphson iteration for accuracy + // recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); + // #endif + return recip; +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si +#[inline] +pub unsafe fn _mm_cvt_ss2si(a: float32x4_t) -> i32 { + // #if (defined(__aarch64__) || defined(_M_ARM64)) || \ + // defined(__ARM_FEATURE_DIRECTED_ROUNDING) + // return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))), + // 0); + // #else + // float32_t data = vgetq_lane_f32( + // vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); + // return (int32_t)data; + // #endif + return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(a)), 0); +} diff --git a/surge-aarch64/src/imports.rs b/surge-aarch64/src/imports.rs new file mode 100644 index 00000000..2b6319ac --- /dev/null +++ b/surge-aarch64/src/imports.rs @@ -0,0 +1 @@ +pub(crate) use surge_imports::*; diff --git a/surge-aarch64/src/lib.rs b/surge-aarch64/src/lib.rs new file mode 100644 index 00000000..80e1b7fd --- /dev/null +++ b/surge-aarch64/src/lib.rs @@ -0,0 +1,5 @@ +#[macro_use] +mod imports; +use imports::*; + +x! {aarch64} diff --git a/surge-imports/src/lib.rs b/surge-imports/src/lib.rs index b147c4dd..955faeb6 100644 --- a/surge-imports/src/lib.rs +++ b/surge-imports/src/lib.rs @@ -1,13 +1,14 @@ #![allow(unused_imports)] /* -#[macro_export] macro_rules! xt { - ($x:ident) => { - #[cfg(test)] mod $x; +#[macro_export] macro_rules! xt { + ($x:ident) => { + #[cfg(test)] mod $x; } } */ +#[cfg(target_arch = "x86_64")] pub fn assert_m128_eq(a: __m128, b: __m128, epsilon: f32) { let a_arr: [f32; 4] = unsafe { std::mem::transmute(a) }; @@ -24,9 +25,7 @@ pub fn assert_m128_eq(a: __m128, b: __m128, epsilon: f32) { } } -#[cfg(target_arch = "x86_64")] -pub use core::arch::x86_64::*; - +#[cfg(target_arch = "x86_64")] pub use core::arch::x86_64::*; pub use disable_macro::disable; diff --git a/surge-math/Cargo.toml b/surge-math/Cargo.toml index d933a6fe..0eea617d 100644 --- a/surge-math/Cargo.toml +++ b/surge-math/Cargo.toml @@ -1,5 +1,7 @@ [dependencies] criterion = "0.3" +[dependencies.surge-aarch64] +path = "../surge-aarch64" [dependencies.surge-errors] path = "../surge-errors" [dependencies.surge-imports] @@ -14,4 +16,4 @@ name = "surge-math" repository = "https://github.com/klebz/surge-rs" version = "0.2.12-alpha.0" [package.metadata] -cpp-authors = [ "Adam Raisbeck ", "Alec DiAstra ", "Alexandre Bique ", "Amy Furniss ", "Brian Ginsburg ", "Cale Gibbard ", "Claes Johanson ", "Dave Palmer ", "David Bata ", "David Carlier ", "Erik-Jan Maalderink ", "Esa Juhani Ruoho ", "Filipe Coelho ", "Hendrik van Boetzelaer ", "Jacky Ligon ", "Jani Frilander ", "Jarkko Sakkinen ", "Jatin Chowdhury ", "Jean Pierre Cimalando ", "Jeff Smith ", "Jeremy Sagan ", "Jon Spruyt ", "Keith Zantow ", "Kjetil Matheussen ", "Korridor ", "Luna Langton ", "Mario Krušelj a.k.a. 
EvilDragon ", "Markku Tavasti ", "Marty Lake ", "Matthias von Faber ", "Mikko Saarinki ", "Nathan Kopp ", "Paul Walker ", "Rich Elmes ", "Sebastian Schwarzhuber ", "The Emu (J Riley Hill) ", "Vincent Zauhar ", "klebs ", "muhl ", "簡志瑋 " ] \ No newline at end of file +cpp-authors = [ "Adam Raisbeck ", "Alec DiAstra ", "Alexandre Bique ", "Amy Furniss ", "Brian Ginsburg ", "Cale Gibbard ", "Claes Johanson ", "Dave Palmer ", "David Bata ", "David Carlier ", "Erik-Jan Maalderink ", "Esa Juhani Ruoho ", "Filipe Coelho ", "Hendrik van Boetzelaer ", "Jacky Ligon ", "Jani Frilander ", "Jarkko Sakkinen ", "Jatin Chowdhury ", "Jean Pierre Cimalando ", "Jeff Smith ", "Jeremy Sagan ", "Jon Spruyt ", "Keith Zantow ", "Kjetil Matheussen ", "Korridor ", "Luna Langton ", "Mario Krušelj a.k.a. EvilDragon ", "Markku Tavasti ", "Marty Lake ", "Matthias von Faber ", "Mikko Saarinki ", "Nathan Kopp ", "Paul Walker ", "Rich Elmes ", "Sebastian Schwarzhuber ", "The Emu (J Riley Hill) ", "Vincent Zauhar ", "klebs ", "muhl ", "簡志瑋 " ] diff --git a/surge-math/src/absmax.rs b/surge-math/src/absmax.rs index 2dda4fba..bfdbf91f 100644 --- a/surge-math/src/absmax.rs +++ b/surge-math/src/absmax.rs @@ -1,8 +1,8 @@ crate::ix!(); -#[inline] pub fn abs_ps(x: __m128 ) -> __m128 -{ - unsafe{ _mm_and_ps(x, m128_mask_absval![]) } +#[inline] +pub fn abs_ps(x: __m128) -> __m128 { + unsafe { _mm_and_ps(x, m128_mask_absval![]) } } /// Returns the maximum absolute value found in a memory @@ -17,20 +17,20 @@ crate::ix!(); /// The number of blocks is determined by the nquads /// parameter. /// -#[inline] pub unsafe fn get_absmax(d: *mut f32, nquads: usize) -> f32 -{ +#[cfg(target_arch = "x86_64")] +#[inline] +pub unsafe fn get_absmax(d: *mut f32, nquads: usize) -> f32 { let mut mx1: __m128 = _mm_setzero_ps(); let mut mx2: __m128 = _mm_setzero_ps(); for i in (0..nquads).step_by(2) { - - let d_0 = access(d,i); - let d_1 = access(d,i+1); + let d_0 = access(d, i); + let d_1 = access(d, i + 1); let mask = m128_mask_absval![]; - mx1 = _mm_max_ps( mx1, _mm_and_ps(*d_0, mask)); - mx2 = _mm_max_ps( mx2, _mm_and_ps(*d_1, mask)); + mx1 = _mm_max_ps(mx1, _mm_and_ps(*d_0, mask)); + mx2 = _mm_max_ps(mx2, _mm_and_ps(*d_1, mask)); } mx1 = _mm_max_ps(mx1, mx2); @@ -44,10 +44,11 @@ crate::ix!(); } /// Returns the maximum absolute value found in two memory blocks. -/// +/// /// The `d1` and `d2` pointers are expected to be aligned to a multiple of 16 bytes. /// This function processes the input data in blocks of four 32-bit floating-point numbers. /// The number of blocks is determined by the `nblocks` parameter. 
+#[cfg(target_arch = "x86_64")] #[inline] pub unsafe fn get_absmax_2(d1: *const f32, d2: *const f32, nblocks: usize) -> f32 { assert_eq!((d1 as usize) % 16, 0, "d1 pointer must be aligned"); @@ -74,14 +75,16 @@ pub unsafe fn get_absmax_2(d1: *const f32, d2: *const f32, nblocks: usize) -> f3 max_val = _mm_max_ps(max_val, max_d1_d2); // Update global max } - + // Horizontal max to reduce all values in `max_val` to a single maximum value - let temp1 = _mm_max_ps(max_val, _mm_shuffle_ps(max_val, max_val, _MM_SHUFFLE(2, 3, 0, 1))); + let temp1 = _mm_max_ps( + max_val, + _mm_shuffle_ps(max_val, max_val, _MM_SHUFFLE(2, 3, 0, 1)), + ); let temp2 = _mm_max_ps(temp1, _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(1, 0, 3, 2))); // Extract the maximum value from the SIMD register let result = _mm_cvtss_f32(temp2); - result } diff --git a/surge-math/src/get_sign.rs b/surge-math/src/get_sign.rs index d5350dfe..0ab8daa3 100644 --- a/surge-math/src/get_sign.rs +++ b/surge-math/src/get_sign.rs @@ -1,3 +1,5 @@ +use std::arch::asm; + crate::ix!(); /// Returns the sign of an i32 integer as -1 or 1. @@ -8,11 +10,10 @@ crate::ix!(); /// On other architectures, the function uses inline /// assembly. /// -#[cfg(target_arch = "x86_64")] -pub fn sign(x: i32) -> i32 -{ - match x < 0 { - true => -1, +#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))] +pub fn sign(x: i32) -> i32 { + match x < 0 { + true => -1, false => 1, } } @@ -26,7 +27,7 @@ pub fn sign(x: i32) -> i32 /// Note that inline assembly is only supported on the /// nightly Rust. /// -#[cfg(not(target_arch = "x86_64"))] +#[cfg(all(not(target_arch = "x86_64"), not(target_arch = "aarch64")))] pub fn sign(x: i32) -> i32 { let result: i32; unsafe { @@ -41,5 +42,3 @@ pub fn sign(x: i32) -> i32 { } result } - - diff --git a/surge-math/src/imports.rs b/surge-math/src/imports.rs index 62760f34..a22be33b 100644 --- a/surge-math/src/imports.rs +++ b/surge-math/src/imports.rs @@ -1,2 +1,5 @@ pub(crate) use surge_errors::*; pub(crate) use surge_imports::*; + +#[cfg(target_arch = "aarch64")] +pub(crate) use surge_aarch64::*; diff --git a/surge-math/src/lib.rs b/surge-math/src/lib.rs index 43b3e74f..13a51c8b 100644 --- a/surge-math/src/lib.rs +++ b/surge-math/src/lib.rs @@ -3,50 +3,54 @@ #![allow(unused_imports)] #![feature(test)] #![feature(core_intrinsics)] -#![feature(stdarch_x86_mm_shuffle)] +// #[cfg(target_arch = "x86_64")] +// #![feature(stdarch_x86_mm_shuffle)] #![feature(trait_alias)] -#[macro_use] mod imports; use imports::*; -#[macro_use] pub mod m128; +#[macro_use] +mod imports; +use imports::*; +#[macro_use] +pub mod m128; -x!{align} -x!{error} -x!{fill} -x!{allpass} -x!{basic_ops} -x!{clamp} -x!{clear} -x!{convert} -x!{copy} -x!{correlated_noise} -x!{encode} -x!{get_sign} -x!{absmax} -x!{access} -x!{mul} -x!{sub} -x!{sum} -x!{add} -x!{accum} -x!{fast} -x!{flops} -x!{hardclip} -x!{interp} -x!{limit_range} -x!{ord} -x!{padding} -x!{pan} -x!{saturate} -x!{scan} -x!{scratch} -x!{simd} -x!{sinc} -x!{sine} -x!{softclip} -x!{square} -x!{tanh} -x!{trixpan} -x!{utility} -x!{value_cast} -x!{white_noise} -x!{window} +x! {align} +x! {error} +x! {fill} +x! {allpass} +x! {basic_ops} +x! {clamp} +x! {clear} +x! {convert} +x! {copy} +x! {correlated_noise} +x! {encode} +x! {get_sign} +x! {absmax} +x! {access} +x! {mul} +x! {sub} +x! {sum} +x! {add} +x! {accum} +x! {fast} +x! {flops} +x! {hardclip} +x! {interp} +x! {limit_range} +x! {ord} +x! {padding} +x! {pan} +x! {saturate} +x! {scan} +x! {scratch} +x! {simd} +x! {sinc} +x! {sine} +x! {softclip} +x! {square} +x! 
{tanh} +x! {trixpan} +x! {utility} +x! {value_cast} +x! {white_noise} +x! {window} diff --git a/surge-math/src/limit_range.rs b/surge-math/src/limit_range.rs index c6fecf0a..8da5f3c8 100644 --- a/surge-math/src/limit_range.rs +++ b/surge-math/src/limit_range.rs @@ -1,69 +1,61 @@ crate::ix!(); pub trait LimitRange: Sized { - fn limit_range(self, low: Self, hi: Self) -> Self; /** - | provides a possibility for taking a - | non-simd path. if we don't customize it, - | the ordinary simd code will be called if it - | is used in "limit_range" - | - */ - #[inline] fn limit_range_nosimd(self, low: Self, hi: Self) -> Self { - self.limit_range(low,hi) + | provides a possibility for taking a + | non-simd path. if we don't customize it, + | the ordinary simd code will be called if it + | is used in "limit_range" + | + */ + #[inline] + fn limit_range_nosimd(self, low: Self, hi: Self) -> Self { + self.limit_range(low, hi) } } pub fn limit_range(x: T, low: T, high: T) -> T { - x.limit_range(low,high) + x.limit_range(low, high) } impl LimitRange for f32 { - - #[cfg(target_arch = "x86_64")] #[inline] + // #[cfg(target_arch = "x86_64")] + #[inline] fn limit_range(self, low: Self, high: Self) -> Self { - let mut result: f32 = 0.0; unsafe { - - let _s = _mm_load_ss(&self); - let _low = _mm_load_ss(&low); + let _s = _mm_load_ss(&self); + let _low = _mm_load_ss(&low); let _high = _mm_load_ss(&high); let limited = _mm_min_ss(_mm_max_ss(_s, _low), _high); - _mm_store_ss( &mut result, limited) + _mm_store_ss(&mut result, limited) }; result } fn limit_range_nosimd(self, min: f32, max: f32) -> f32 { - std::cmp::max(std::cmp::min(FloatOrd(self),FloatOrd(max)), FloatOrd(min)).0 + std::cmp::max(std::cmp::min(FloatOrd(self), FloatOrd(max)), FloatOrd(min)).0 } } impl LimitRange for i32 { - - fn limit_range(self, l: i32, h: i32) -> i32 - { - std::cmp::max(std::cmp::min(self,h), l) + fn limit_range(self, l: i32, h: i32) -> i32 { + std::cmp::max(std::cmp::min(self, h), l) } } impl LimitRange for u32 { - fn limit_range(self, l: u32, h: u32) -> u32 - { - std::cmp::max(std::cmp::min(self,h), l) + fn limit_range(self, l: u32, h: u32) -> u32 { + std::cmp::max(std::cmp::min(self, h), l) } } - impl LimitRange for f64 { - fn limit_range(self, min:f64, max:f64) -> f64 - { - std::cmp::max(std::cmp::min(FloatOrd(self),FloatOrd(max)), FloatOrd(min)).0 + fn limit_range(self, min: f64, max: f64) -> f64 { + std::cmp::max(std::cmp::min(FloatOrd(self), FloatOrd(max)), FloatOrd(min)).0 } } - diff --git a/surge-math/src/square.rs b/surge-math/src/square.rs index c9bedcb2..38f1bf06 100644 --- a/surge-math/src/square.rs +++ b/surge-math/src/square.rs @@ -1,30 +1,41 @@ crate::ix!(); -#[inline] pub fn v_sqrt_fast(v: __m128) -> __m128 -{ - unsafe { - _mm_rcp_ps(_mm_rsqrt_ps(v)) - } +#[inline] +pub fn v_sqrt_fast(v: __m128) -> __m128 { + unsafe { _mm_rcp_ps(_mm_rsqrt_ps(v)) } } -#[inline] pub fn square(x: f64) -> f64 { +#[inline] +pub fn square(x: f64) -> f64 { x * x } -pub fn get_squaremax(d: *mut f32, nquads: NQ) -> f32 -where >::Error: std::fmt::Debug, - NQ: TryInto +#[cfg(target_arch = "x86_64")] +pub fn get_squaremax(d: *mut f32, nquads: NQ) -> f32 +where + >::Error: std::fmt::Debug, + NQ: TryInto, { let nquads: u32 = nquads.try_into().unwrap(); unsafe { - let mut mx1: __m128 = _mm_setzero_ps(); let mut mx2: __m128 = _mm_setzero_ps(); - for i in (0..nquads).step_by(2) - { - mx1 = _mm_max_ps(mx1, _mm_mul_ps(*(d as *mut __m128).offset(i as isize), *(d as *mut __m128).offset(i as isize))); - mx2 = _mm_max_ps(mx2, _mm_mul_ps(*(d as *mut 
__m128).offset((i + 1) as isize), *(d as *mut __m128).offset((i + 1) as isize))); + for i in (0..nquads).step_by(2) { + mx1 = _mm_max_ps( + mx1, + _mm_mul_ps( + *(d as *mut __m128).offset(i as isize), + *(d as *mut __m128).offset(i as isize), + ), + ); + mx2 = _mm_max_ps( + mx2, + _mm_mul_ps( + *(d as *mut __m128).offset((i + 1) as isize), + *(d as *mut __m128).offset((i + 1) as isize), + ), + ); } mx1 = _mm_max_ps(mx1, mx2); diff --git a/surge-math/src/sum.rs b/surge-math/src/sum.rs index d97fc247..f651bfc4 100644 --- a/surge-math/src/sum.rs +++ b/surge-math/src/sum.rs @@ -3,24 +3,15 @@ crate::ix!(); /// Sums the elements of a __m128 vector and returns the /// result as a f32 value. /// -#[inline] pub fn v_sum(x: __m128) -> f32 -{ +#[cfg(target_arch = "x86_64")] +#[inline] +pub fn v_sum(x: __m128) -> f32 { unsafe { - let mut f: f32 = 0.0; - let mut a: __m128 = _mm_add_ps( - x, - _mm_movehl_ps(x, x) - ); + let mut a: __m128 = _mm_add_ps(x, _mm_movehl_ps(x, x)); - a = _mm_add_ss( - a, - _mm_shuffle_ps( - a, - a, - _MM_SHUFFLE(0, 0, 0, 1)) - ); + a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE!(0, 0, 0, 1))); _mm_store_ss(&mut f, a); diff --git a/surge-math/src/tanh.rs b/surge-math/src/tanh.rs index 268f3e70..959a8fb2 100644 --- a/surge-math/src/tanh.rs +++ b/surge-math/src/tanh.rs @@ -1,7 +1,7 @@ crate::ix!(); #[allow(clippy::many_single_char_names)] -#[cfg(target_arch = "x86_64")] #[inline] +#[cfg(target_arch = "x86_64")] #[inline] pub fn tanh7_ps(x: __m128) -> __m128 { unsafe { @@ -17,12 +17,12 @@ pub fn tanh7_ps(x: __m128) -> __m128 x4 = _mm_mul_ps(x4, xx) ; y = _mm_add_ps(y, _mm_mul_ps(c, x4)) ; - _mm_mul_ps(y, x) + _mm_mul_ps(y, x) } } -#[cfg(target_arch = "x86_64")] #[inline] +#[cfg(target_arch = "x86_64")] #[inline] pub fn tanh_fast32(x_in: f32) -> f32 { assert!(within_range(0.0, x_in, 1.0)); @@ -35,7 +35,7 @@ pub fn tanh_fast32(x_in: f32) -> f32 { let mut denom: f32 = 1.0 + x + xx + (a * x * xx); - unsafe{ _mm_store_ss(&mut denom, + unsafe{ _mm_store_ss(&mut denom, _mm_rcp_ss(_mm_load_ss(&denom)))}; match x_in > 0.0 { @@ -45,7 +45,8 @@ pub fn tanh_fast32(x_in: f32) -> f32 { } #[allow(clippy::many_single_char_names)] -#[inline] pub fn tanh7_ss(x: __m128 ) -> __m128 +#[cfg(target_arch = "x86_64")] #[inline] +pub fn tanh7_ss(x: __m128 ) -> __m128 { unsafe { let a: __m128 = _mm_set1_ps(-1.0 / 3.0); @@ -63,9 +64,10 @@ pub fn tanh_fast32(x_in: f32) -> f32 { } #[allow(clippy::many_single_char_names)] -#[inline] pub fn tanh7_f64(x: f64 ) -> f64 +#[cfg(target_arch = "x86_64")] #[inline] +pub fn tanh7_f64(x: f64 ) -> f64 { - let a: f64 = -1.0 / 3.0; + let a: f64 = -1.0 / 3.0; let b: f64 = 2.0 / 15.0; let c: f64 = -17.0 / 315.0; // return tanh(x); @@ -84,6 +86,7 @@ pub fn tanh_fast32(x_in: f32) -> f32 { } #[allow(clippy::many_single_char_names)] +#[cfg(target_arch = "x86_64")] pub fn tanh7_block(xb: *mut f32, nquads: NQ) where >::Error: std::fmt::Debug, NQ: TryInto { @@ -96,8 +99,8 @@ pub fn tanh7_block(xb: *mut f32, nquads: NQ) let upper_bound: __m128 = _mm_set1_ps(1.1390); let lower_bound: __m128 = _mm_set1_ps(-1.1390); - let mut t: [__m128; 4] = [z128![]; 4]; - let mut x: [__m128; 4] = [z128![]; 4]; + let mut t: [__m128; 4] = [z128![]; 4]; + let mut x: [__m128; 4] = [z128![]; 4]; let mut xx: [__m128; 4] = [z128![]; 4]; for i in (0..nquads).step_by(4) @@ -183,9 +186,9 @@ pub fn tanh7_block(xb: *mut f32, nquads: NQ) y * x_in } -pub fn shafted_tanh(x: f64) -> f64 +pub fn shafted_tanh(x: f64) -> f64 { - (x.exp() - (-x * 1.2).exp()) / + (x.exp() - (-x * 1.2).exp()) / ((x).exp() + (-x).exp()) }
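A minimal smoke-test sketch for the new NEON shims (illustrative only, not part of the patch above). It assumes the module is appended inside surge-aarch64/src/aarch64.rs and is run on an aarch64 host; the module name and the to_array helper are hypothetical.

#[cfg(all(test, target_arch = "aarch64"))]
mod neon_shim_smoke_tests {
    use super::*;
    use std::arch::aarch64::{float32x4_t, vst1q_f32};

    // Copy a float32x4_t out to a plain array so lanes can be compared exactly.
    fn to_array(v: float32x4_t) -> [f32; 4] {
        let mut out = [0.0f32; 4];
        unsafe { vst1q_f32(out.as_mut_ptr(), v) };
        out
    }

    #[test]
    fn shims_match_scalar_reference() {
        unsafe {
            // _mm_set_ps(w, z, y, x) puts x in lane 0, mirroring the SSE argument order.
            let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0); // lanes: [1.0, 2.0, 3.0, 4.0]
            let b = _mm_set1_ps(2.5);

            assert_eq!(to_array(_mm_add_ps(a, b)), [3.5, 4.5, 5.5, 6.5]);
            assert_eq!(to_array(_mm_max_ps(a, b)), [2.5, 2.5, 3.0, 4.0]);
            assert_eq!(to_array(_mm_min_ps(a, b)), [1.0, 2.0, 2.5, 2.5]);
            assert_eq!(_mm_cvtss_f32(_mm_mul_ps(a, b)), 2.5);
        }
    }
}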