diff --git a/Cargo.toml b/Cargo.toml index 1f972979..817966ca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,2 +1,2 @@ [workspace] -members = [ "surge-hound", "surge-stepseq", "surge-imports", "surge-biquad", "surge-blitter", "surge-coeffmaker", "surge-constants", "surge-filter", "surge-halfrate", "surge-input", "surge-lag", "surge-lipol", "surge-math", "surge-midi", "surge-modulation", "surge-mpe", "surge-output", "surge-param", "surge-qfunit", "surge-quadrosc", "surge-samplerate", "surge-scene", "surge-voice", "surge-svf", "surge-synthesizer", "surge-tables", "surge-timeunit", "surge-traits", "surge-tuning", "surge-types", "surge-wavetable", "surgefilter-comb", "surgefilter-diode", "surgefilter-iir", "surgefilter-k35", "surgefilter-huovilainen", "surgefilter-rungekutta", "surgefilter-moog", "surgefilter-nlfeedback", "surgefilter-nlstates", "surgefilter-obxd", "surgefilter-snh", "surgefilter-svf", "surgefx-allpass", "surgefx-chorus", "surgefx-conditioner", "surgefx-distortion", "surgefx-dualdelay", "surgefx-emphasize", "surgefx-eq3band", "surgefx-flanger", "surgefx-freqshift", "surgefx-phaser", "surgefx-reverb", "surgefx-ringmod", "surgefx-rotary", "surgefx-vocoder", "surgeosc-audioin", "surgeosc-fm", "surgeosc-fm2", "surgeosc-sine", "surgeosc-snh", "surgeosc-super", "surgeosc-wavetable", "surgeosc-window", "surgeshaper-asym", "surgeshaper-clip", "surgeshaper-digi", "surgeshaper-sine", "surgeshaper-tanh", "surge-derive", "surge-errors" ] \ No newline at end of file +members = [ "surge-hound", "surge-stepseq", "surge-imports", "surge-biquad", "surge-blitter", "surge-coeffmaker", "surge-constants", "surge-filter", "surge-halfrate", "surge-input", "surge-lag", "surge-lipol", "surge-math", "surge-midi", "surge-modulation", "surge-mpe", "surge-output", "surge-param", "surge-qfunit", "surge-quadrosc", "surge-samplerate", "surge-scene", "surge-voice", "surge-svf", "surge-synthesizer", "surge-tables", "surge-timeunit", "surge-traits", "surge-tuning", "surge-types", "surge-wavetable", "surgefilter-comb", "surgefilter-diode", "surgefilter-iir", "surgefilter-k35", "surgefilter-huovilainen", "surgefilter-rungekutta", "surgefilter-moog", "surgefilter-nlfeedback", "surgefilter-nlstates", "surgefilter-obxd", "surgefilter-snh", "surgefilter-svf", "surgefx-allpass", "surgefx-chorus", "surgefx-conditioner", "surgefx-distortion", "surgefx-dualdelay", "surgefx-emphasize", "surgefx-eq3band", "surgefx-flanger", "surgefx-freqshift", "surgefx-phaser", "surgefx-reverb", "surgefx-ringmod", "surgefx-rotary", "surgefx-vocoder", "surgeosc-audioin", "surgeosc-fm", "surgeosc-fm2", "surgeosc-sine", "surgeosc-snh", "surgeosc-super", "surgeosc-wavetable", "surgeosc-window", "surgeshaper-asym", "surgeshaper-clip", "surgeshaper-digi", "surgeshaper-sine", "surgeshaper-tanh", "surge-derive", "surge-errors" , "surge-aarch64"] diff --git a/surge-aarch64/Cargo.toml b/surge-aarch64/Cargo.toml new file mode 100644 index 00000000..a2b303fc --- /dev/null +++ b/surge-aarch64/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "surge-aarch64" +version = "0.1.0" +edition = "2021" + +[dependencies.surge-imports] +path = "../surge-imports" diff --git a/surge-aarch64/src/aarch64.rs b/surge-aarch64/src/aarch64.rs new file mode 100644 index 00000000..fc6041d9 --- /dev/null +++ b/surge-aarch64/src/aarch64.rs @@ -0,0 +1,471 @@ +// https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h + +crate::ix!(); + +use std::arch::aarch64::*; + +pub type __m128 = float32x4_t; +pub type __m128i = int32x4_t; + +#[macro_export] +macro_rules! 
vreinterpretq_m128i_s32 { + ($x: expr) => { + // `__m128i` is defined as `int32x4_t` in this crate, so reinterpreting from `s32` is the identity. + $x + }; +} + +#[macro_export] +macro_rules! vreinterpretq_m128_f32 { + ($x: expr) => { + $x + }; +} + +/** + * Loads a single-precision, floating-point value into the low word and clears the upper three words. https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx + */ +#[inline] +pub fn _mm_load_ss(p: &f32) -> float32x4_t { + return unsafe { + let result: float32x4_t = vdupq_n_f32(0_f32); + vsetq_lane_f32(*p, result, 0) + }; +} + +// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx +#[inline] +pub fn _mm_load_ps(p: *const f32) -> float32x4_t { + return unsafe { vld1q_f32(p) }; +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps +#[inline] +pub unsafe fn _mm_load1_ps(p: *const f32) -> float32x4_t { + return vld1q_dup_f32(p); +} + +/** + * Computes the maximums of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx + */ +#[inline] +pub fn _mm_max_ps(a: float32x4_t, b: float32x4_t) -> float32x4_t { + // #if USE_PRECISE_MINMAX_IMPLEMENTATION + // return vbslq_f32(vcltq_f32(b,a),a,b); + // #else + // // Faster, but would give inconsistent rendering (e.g. holes, NaN pixels) + // return vmaxq_f32(a, b); + // #endif + + return unsafe { vbslq_f32(vcltq_f32(b, a), a, b) }; +} + +/** + * Computes the minima of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx + */ +#[inline] +pub fn _mm_min_ps(a: float32x4_t, b: float32x4_t) -> float32x4_t { + // #if USE_PRECISE_MINMAX_IMPLEMENTATION + // return vbslq_f32(vcltq_f32(a,b),a,b); + // #else + // // Faster, but would give inconsistent rendering (e.g. holes, NaN pixels) + // return vminq_f32(a, b); + // #endif + + return unsafe { vbslq_f32(vcltq_f32(a, b), a, b) }; +} + +/** + * Computes the maximum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx + */ +#[inline] +pub fn _mm_max_ss(a: float32x4_t, b: float32x4_t) -> float32x4_t { + // float32x4_t value; + // float32x4_t result = a; + + let value = _mm_max_ps(a, b); + return unsafe { vsetq_lane_f32(vgetq_lane_f32(value, 0), a, 0) }; +} + +/** + * Computes the minimum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx + */ +#[inline] +pub fn _mm_min_ss(a: float32x4_t, b: float32x4_t) -> float32x4_t { + let value = _mm_min_ps(a, b); + return unsafe { vsetq_lane_f32(vgetq_lane_f32(value, 0), a, 0) }; +} + +/** + * Stores the lower single-precision, floating-point value. https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx + */ +#[inline] +pub fn _mm_store_ss(p: *mut f32, a: float32x4_t) { + unsafe { vst1q_lane_f32(p, a, 0) }; +} + +/** + * Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx + */ +#[inline] +pub fn _mm_store_ps(p: *mut f32, a: float32x4_t) { + unsafe { vst1q_f32(p, a) }; +} + +/** + * Multiplies the four single-precision, floating-point values of a and b. 
https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx + */ +#[inline] +pub fn _mm_mul_ps(a: float32x4_t, b: float32x4_t) -> float32x4_t { + return unsafe { vmulq_f32(a, b) }; +} +/** + * Multiplies the lower scalar single-precision, floating-point values of a and b; the upper three values are passed through from a. https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx + */ +#[inline] +pub fn _mm_mul_ss(a: float32x4_t, b: float32x4_t) -> float32x4_t { + // Only lane 0 is multiplied; the upper lanes of `a` are passed through, matching the SSE scalar op. + return unsafe { vsetq_lane_f32(vgetq_lane_f32(a, 0) * vgetq_lane_f32(b, 0), a, 0) }; +} + +// Clears the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx +#[inline] +pub unsafe fn _mm_setzero_ps() -> float32x4_t { + return vdupq_n_f32(0_f32); +} + +// Set packed single-precision (32-bit) floating-point elements in dst with the +// supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps +#[inline] +pub unsafe fn _mm_set_ps(w: f32, z: f32, y: f32, x: f32) -> float32x4_t { + // float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; + + // Use a plain array rather than a struct: the field layout of a default + // (non-#[repr(C)]) Rust struct is not guaranteed, an array's is. + let data: [f32; 4] = [x, y, z, w]; + return vld1q_f32(data.as_ptr()); +} + +// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +#[inline] +pub fn _mm_set1_ps(_w: f32) -> float32x4_t { + return unsafe { vdupq_n_f32(_w) }; +} + +// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +#[inline] +pub fn _mm_set_ps1(_w: f32) -> float32x4_t { + return unsafe { vdupq_n_f32(_w) }; +} + +// Adds the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx +#[inline] +pub unsafe fn _mm_add_ps(a: float32x4_t, b: float32x4_t) -> float32x4_t { + return vaddq_f32(a, b); +} + +// Adds the scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx +#[inline] +pub fn _mm_add_ss(a: float32x4_t, b: float32x4_t) -> float32x4_t { + unsafe { + let b0 = vgetq_lane_f32(b, 0); + let mut value = vdupq_n_f32(0_f32); + + // The upper values in the result must be the remnants of `a`, so only lane 0 of `b` is added. + value = vsetq_lane_f32(b0, value, 0); + return vaddq_f32(a, value); + }; +} + +// Compute the bitwise AND of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps +#[inline] +pub unsafe fn _mm_and_ps(a: float32x4_t, b: float32x4_t) -> float32x4_t { + return vreinterpretq_f32_s32(vandq_s32( + vreinterpretq_s32_f32(a), + vreinterpretq_s32_f32(b), + )); +} + +// Subtracts the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx +#[inline] +pub fn _mm_sub_ps(a: float32x4_t, b: float32x4_t) -> float32x4_t { + return unsafe { vsubq_f32(a, b) }; +} + +// Subtracts the lower scalar single-precision, floating-point value of b from a; the upper three values are passed through from a, matching the SSE scalar op. +#[inline] +pub fn _mm_sub_ss(a: float32x4_t, b: float32x4_t) -> float32x4_t { + return unsafe { vsetq_lane_f32(vgetq_lane_f32(a, 0) - vgetq_lane_f32(b, 0), a, 0) }; +} + +/*******************************************************/ +/* MACRO for shuffle parameter for _mm_shuffle_ps(). */ +/* Argument fp3 is a digit[0123] that represents the fp*/ +/* from argument "b" of mm_shuffle_ps that will be */ +/* placed in fp3 of result. fp2 is the same for fp2 in */ +/* result. 
fp1 is a digit[0123] that represents the fp */ +/* from argument "a" of mm_shuffle_ps that will be */ +/* placed in fp1 of result. fp0 is the same for fp0 of */ +/* result */ +/*******************************************************/ + +// fn _MN_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3), (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3) } ) + +// fn _MF_SHUFFLE(fp3,fp2,fp1,fp0) { (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2), (((fp1)*4)+16+3), (((fp0)*4)+16+0), (((fp0)*4)+16+1), (((fp0)*4)+16+2), (((fp0)*4)+16+3) } } + +// pub fn _MM_SHUFFLE(fp3: u8, fp2: u8, fp1: u8, fp0: u8) -> u8 { +// ((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0) +// } + +#[macro_export] +macro_rules! _MM_SHUFFLE { + ($fp3: literal, $fp2: literal, $fp1: literal, $fp0: literal) => { + (($fp3) << 6) | (($fp2) << 4) | (($fp1) << 2) | ($fp0) + }; +} + +#[macro_export] +macro_rules! vreinterpretq_f32_m128 { + ($x: expr) => { + $x + }; +} + +// NEON does not support a general purpose permute intrinsic. +// Shuffle single-precision (32-bit) floating-point elements in a using the +// control in imm8, and store the results in dst. +// +// C equivalent: +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, +// __constrange(0, 255) int imm) { +// __m128 ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// return ret; +// } +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps +// _mm_shuffle_ps_default +// pub fn _mm_shuffle_ps(a: float32x4_t, b: float32x4_t, imm: *const i8) -> float32x4_t { +// unsafe { +// vreinterpretq_m128_f32!(vsetq_lane_f32( +// vgetq_lane_f32(vreinterpretq_f32_m128!(b), ((imm) >> 6) & 0x3), +// vsetq_lane_f32( +// vgetq_lane_f32(vreinterpretq_f32_m128!(b), ((imm) >> 4) & 0x3), +// vsetq_lane_f32( +// vgetq_lane_f32(vreinterpretq_f32_m128!(a), ((imm) >> 2) & 0x3), +// vmovq_n_f32(vgetq_lane_f32(vreinterpretq_f32_m128!(a), (imm) & (0x3))), +// 1, +// ), +// 2, +// ), +// 3, +// )) +// } +// } + +// #[macro_export] +// macro_rules! _mm_shuffle_ps { +// ($a: ident, $b: ident, $imm: tt) => { +// unsafe { +// vreinterpretq_m128_f32(vsetq_lane_f32( +// vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), +// vsetq_lane_f32( +// vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), +// vsetq_lane_f32( +// vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), +// vmovq_n_f32(vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), +// 1, +// ), +// 2, +// ), +// 3, +// )) +// } +// }; +// } + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps +#[inline] +pub unsafe fn _mm_storeu_ps(p: *mut f32, a: float32x4_t) { + vst1q_f32(p, a); +} + +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr does not need to be aligned on any +// particular boundary. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps +#[inline] +pub unsafe fn _mm_loadu_ps(p: *const f32) -> float32x4_t { + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are + // equivalent for neon + return vld1q_f32(p); +} + +// Compute the approximate reciprocal square root of packed single-precision +// (32-bit) floating-point elements in a, and store the results in dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps +#[inline] +pub unsafe fn _mm_rsqrt_ps(i: float32x4_t) -> float32x4_t { + let mut out: float32x4_t = vrsqrteq_f32(i); + + // Generate masks for detecting whether input has any 0.0f/-0.0f + // (which becomes positive/negative infinity by IEEE-754 arithmetic rules). + let pos_inf: uint32x4_t = vdupq_n_u32(0x7F800000); + let neg_inf: uint32x4_t = vdupq_n_u32(0xFF800000); + let has_pos_zero: uint32x4_t = vceqq_u32(pos_inf, vreinterpretq_u32_f32(out)); + let has_neg_zero: uint32x4_t = vceqq_u32(neg_inf, vreinterpretq_u32_f32(out)); + + out = vmulq_f32(out, vrsqrtsq_f32(vmulq_f32(i, out), out)); + // #if SSE2NEON_PRECISE_SQRT + // // Additional Netwon-Raphson iteration for accuracy + // out = vmulq_f32( + // out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); + // #endif + + // Set output vector element to infinity/negative-infinity if + // the corresponding input vector element is 0.0f/-0.0f. + out = vbslq_f32(has_pos_zero, vreinterpretq_f32_u32(pos_inf), out); + out = vbslq_f32(has_neg_zero, vreinterpretq_f32_u32(neg_inf), out); + + return out; +} + +// Compute the approximate reciprocal square root of the lower single-precision +// (32-bit) floating-point element in a, store the result in the lower element +// of dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss +#[inline] +pub unsafe fn _mm_rsqrt_ss(i: float32x4_t) -> float32x4_t { + return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(i), 0), i, 0); +} + +// Broadcast 32-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32 +#[inline] +pub unsafe fn _mm_set1_epi32(_i: i32) -> int32x4_t { + return vdupq_n_s32(_i); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32 +#[inline] +pub unsafe fn _mm_cvttps_epi32(a: float32x4_t) -> int32x4_t { + return vcvtq_s32_f32(a); +} + +// Add packed 32-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32 +#[inline] +pub unsafe fn _mm_add_epi32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + return vaddq_s32(a, b); +} + +// Cast vector of type __m128i to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps +#[inline] +pub unsafe fn _mm_castsi128_ps(a: int32x4_t) -> float32x4_t { + // `__m128i` is `int32x4_t` in this crate, so a single reinterpret suffices. + return vreinterpretq_f32_s32(a); +} + +// #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) + +// Copy the lower single-precision (32-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32 +#[inline] +pub unsafe fn _mm_cvtss_f32(a: float32x4_t) -> f32 { + return vgetq_lane_f32(a, 0); +} + +// Divide packed single-precision (32-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. +// Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement +// division by multiplying a by b's reciprocal before using the Newton-Raphson +// method to approximate the results. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps +#[inline] +pub unsafe fn _mm_div_ps(a: float32x4_t, b: float32x4_t) -> float32x4_t { + // #if defined(__aarch64__) || defined(_M_ARM64) + // return vreinterpretq_m128_f32( + // vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + // #else + // float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b)); + // recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); + // // Additional Newton-Raphson iteration for accuracy + // recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); + // return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip)); + // #endif + + return vdivq_f32(a, b); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128 +#[inline] +pub unsafe fn _mm_and_si128(a: int32x4_t, b: int32x4_t) -> int32x4_t { + return vandq_s32(a, b); +} + +// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32 +#[inline] +pub unsafe fn _mm_sub_epi32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + return vsubq_s32(a, b); +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps +#[inline] +pub unsafe fn _mm_cvtepi32_ps(a: int32x4_t) -> float32x4_t { + return vcvtq_f32_s32(a); +} + +// Compute the approximate reciprocal of packed single-precision (32-bit) +// floating-point elements in a, and store the results in dst. The maximum +// relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps +#[inline] +pub unsafe fn _mm_rcp_ps(i: float32x4_t) -> float32x4_t { + let mut recip: float32x4_t = vrecpeq_f32(i); + recip = vmulq_f32(recip, vrecpsq_f32(recip, i)); + // TODO: SSE2NEON_PRECISE_DIV + // #if SSE2NEON_PRECISE_DIV + // // Additional Newton-Raphson iteration for accuracy + // recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); + // #endif + return recip; +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si +#[inline] +pub unsafe fn _mm_cvt_ss2si(a: float32x4_t) -> i32 { + // #if (defined(__aarch64__) || defined(_M_ARM64)) || \ + // defined(__ARM_FEATURE_DIRECTED_ROUNDING) + // return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))), + // 0); + // #else + // float32_t data = vgetq_lane_f32( + // vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); + // return (int32_t)data; + // #endif + return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(a)), 0); +} diff --git a/surge-aarch64/src/imports.rs b/surge-aarch64/src/imports.rs new file mode 100644 index 00000000..2b6319ac --- /dev/null +++ b/surge-aarch64/src/imports.rs @@ -0,0 +1 @@ +pub(crate) use surge_imports::*; diff --git a/surge-aarch64/src/lib.rs b/surge-aarch64/src/lib.rs new file mode 100644 index 00000000..80e1b7fd --- /dev/null +++ b/surge-aarch64/src/lib.rs @@ -0,0 +1,5 @@ +#[macro_use] +mod imports; +use imports::*; + +x! {aarch64} diff --git a/surge-imports/src/lib.rs b/surge-imports/src/lib.rs index b147c4dd..955faeb6 100644 --- a/surge-imports/src/lib.rs +++ b/surge-imports/src/lib.rs @@ -1,13 +1,14 @@ #![allow(unused_imports)] /* -#[macro_export] macro_rules! xt { - ($x:ident) => { - #[cfg(test)] mod $x; +#[macro_export] macro_rules! xt { + ($x:ident) => { + #[cfg(test)] mod $x; } } */ +#[cfg(target_arch = "x86_64")] pub fn assert_m128_eq(a: __m128, b: __m128, epsilon: f32) { let a_arr: [f32; 4] = unsafe { std::mem::transmute(a) }; @@ -24,9 +25,7 @@ pub fn assert_m128_eq(a: __m128, b: __m128, epsilon: f32) { } } -#[cfg(target_arch = "x86_64")] -pub use core::arch::x86_64::*; - +#[cfg(target_arch = "x86_64")] pub use core::arch::x86_64::*; pub use disable_macro::disable; diff --git a/surge-math/Cargo.toml b/surge-math/Cargo.toml index d933a6fe..0eea617d 100644 --- a/surge-math/Cargo.toml +++ b/surge-math/Cargo.toml @@ -1,5 +1,7 @@ [dependencies] criterion = "0.3" +[dependencies.surge-aarch64] +path = "../surge-aarch64" [dependencies.surge-errors] path = "../surge-errors" [dependencies.surge-imports] @@ -14,4 +16,4 @@ name = "surge-math" repository = "https://github.com/klebz/surge-rs" version = "0.2.12-alpha.0" [package.metadata] -cpp-authors = [ "Adam Raisbeck ", "Alec DiAstra ", "Alexandre Bique ", "Amy Furniss ", "Brian Ginsburg ", "Cale Gibbard ", "Claes Johanson ", "Dave Palmer ", "David Bata ", "David Carlier ", "Erik-Jan Maalderink ", "Esa Juhani Ruoho ", "Filipe Coelho ", "Hendrik van Boetzelaer ", "Jacky Ligon ", "Jani Frilander ", "Jarkko Sakkinen ", "Jatin Chowdhury ", "Jean Pierre Cimalando ", "Jeff Smith ", "Jeremy Sagan ", "Jon Spruyt ", "Keith Zantow ", "Kjetil Matheussen ", "Korridor ", "Luna Langton ", "Mario Krušelj a.k.a. 
EvilDragon ", "Markku Tavasti ", "Marty Lake ", "Matthias von Faber ", "Mikko Saarinki ", "Nathan Kopp ", "Paul Walker ", "Rich Elmes ", "Sebastian Schwarzhuber ", "The Emu (J Riley Hill) ", "Vincent Zauhar ", "klebs ", "muhl ", "簡志瑋 " ] \ No newline at end of file +cpp-authors = [ "Adam Raisbeck ", "Alec DiAstra ", "Alexandre Bique ", "Amy Furniss ", "Brian Ginsburg ", "Cale Gibbard ", "Claes Johanson ", "Dave Palmer ", "David Bata ", "David Carlier ", "Erik-Jan Maalderink ", "Esa Juhani Ruoho ", "Filipe Coelho ", "Hendrik van Boetzelaer ", "Jacky Ligon ", "Jani Frilander ", "Jarkko Sakkinen ", "Jatin Chowdhury ", "Jean Pierre Cimalando ", "Jeff Smith ", "Jeremy Sagan ", "Jon Spruyt ", "Keith Zantow ", "Kjetil Matheussen ", "Korridor ", "Luna Langton ", "Mario Krušelj a.k.a. EvilDragon ", "Markku Tavasti ", "Marty Lake ", "Matthias von Faber ", "Mikko Saarinki ", "Nathan Kopp ", "Paul Walker ", "Rich Elmes ", "Sebastian Schwarzhuber ", "The Emu (J Riley Hill) ", "Vincent Zauhar ", "klebs ", "muhl ", "簡志瑋 " ] diff --git a/surge-math/src/absmax.rs b/surge-math/src/absmax.rs index 2dda4fba..bfdbf91f 100644 --- a/surge-math/src/absmax.rs +++ b/surge-math/src/absmax.rs @@ -1,8 +1,8 @@ crate::ix!(); -#[inline] pub fn abs_ps(x: __m128 ) -> __m128 -{ - unsafe{ _mm_and_ps(x, m128_mask_absval![]) } +#[inline] +pub fn abs_ps(x: __m128) -> __m128 { + unsafe { _mm_and_ps(x, m128_mask_absval![]) } } /// Returns the maximum absolute value found in a memory @@ -17,20 +17,20 @@ crate::ix!(); /// The number of blocks is determined by the nquads /// parameter. /// -#[inline] pub unsafe fn get_absmax(d: *mut f32, nquads: usize) -> f32 -{ +#[cfg(target_arch = "x86_64")] +#[inline] +pub unsafe fn get_absmax(d: *mut f32, nquads: usize) -> f32 { let mut mx1: __m128 = _mm_setzero_ps(); let mut mx2: __m128 = _mm_setzero_ps(); for i in (0..nquads).step_by(2) { - - let d_0 = access(d,i); - let d_1 = access(d,i+1); + let d_0 = access(d, i); + let d_1 = access(d, i + 1); let mask = m128_mask_absval![]; - mx1 = _mm_max_ps( mx1, _mm_and_ps(*d_0, mask)); - mx2 = _mm_max_ps( mx2, _mm_and_ps(*d_1, mask)); + mx1 = _mm_max_ps(mx1, _mm_and_ps(*d_0, mask)); + mx2 = _mm_max_ps(mx2, _mm_and_ps(*d_1, mask)); } mx1 = _mm_max_ps(mx1, mx2); @@ -44,10 +44,11 @@ crate::ix!(); } /// Returns the maximum absolute value found in two memory blocks. -/// +/// /// The `d1` and `d2` pointers are expected to be aligned to a multiple of 16 bytes. /// This function processes the input data in blocks of four 32-bit floating-point numbers. /// The number of blocks is determined by the `nblocks` parameter. 
+#[cfg(target_arch = "x86_64")] #[inline] pub unsafe fn get_absmax_2(d1: *const f32, d2: *const f32, nblocks: usize) -> f32 { assert_eq!((d1 as usize) % 16, 0, "d1 pointer must be aligned"); @@ -74,14 +75,16 @@ pub unsafe fn get_absmax_2(d1: *const f32, d2: *const f32, nblocks: usize) -> f3 max_val = _mm_max_ps(max_val, max_d1_d2); // Update global max } - + // Horizontal max to reduce all values in `max_val` to a single maximum value - let temp1 = _mm_max_ps(max_val, _mm_shuffle_ps(max_val, max_val, _MM_SHUFFLE(2, 3, 0, 1))); + let temp1 = _mm_max_ps( + max_val, + _mm_shuffle_ps(max_val, max_val, _MM_SHUFFLE(2, 3, 0, 1)), + ); let temp2 = _mm_max_ps(temp1, _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(1, 0, 3, 2))); // Extract the maximum value from the SIMD register let result = _mm_cvtss_f32(temp2); - result } diff --git a/surge-math/src/get_sign.rs b/surge-math/src/get_sign.rs index d5350dfe..0ab8daa3 100644 --- a/surge-math/src/get_sign.rs +++ b/surge-math/src/get_sign.rs @@ -1,3 +1,5 @@ +use std::arch::asm; + crate::ix!(); /// Returns the sign of an i32 integer as -1 or 1. @@ -8,11 +10,10 @@ crate::ix!(); /// On other architectures, the function uses inline /// assembly. /// -#[cfg(target_arch = "x86_64")] -pub fn sign(x: i32) -> i32 -{ - match x < 0 { - true => -1, +#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))] +pub fn sign(x: i32) -> i32 { + match x < 0 { + true => -1, false => 1, } } @@ -26,7 +27,7 @@ pub fn sign(x: i32) -> i32 /// Note that inline assembly is only supported on the /// nightly Rust. /// -#[cfg(not(target_arch = "x86_64"))] +#[cfg(all(not(target_arch = "x86_64"), not(target_arch = "aarch64")))] pub fn sign(x: i32) -> i32 { let result: i32; unsafe { @@ -41,5 +42,3 @@ pub fn sign(x: i32) -> i32 { } result } - - diff --git a/surge-math/src/imports.rs b/surge-math/src/imports.rs index 62760f34..a22be33b 100644 --- a/surge-math/src/imports.rs +++ b/surge-math/src/imports.rs @@ -1,2 +1,5 @@ pub(crate) use surge_errors::*; pub(crate) use surge_imports::*; + +#[cfg(target_arch = "aarch64")] +pub(crate) use surge_aarch64::*; diff --git a/surge-math/src/lib.rs b/surge-math/src/lib.rs index 43b3e74f..13a51c8b 100644 --- a/surge-math/src/lib.rs +++ b/surge-math/src/lib.rs @@ -3,50 +3,54 @@ #![allow(unused_imports)] #![feature(test)] #![feature(core_intrinsics)] -#![feature(stdarch_x86_mm_shuffle)] +// #[cfg(target_arch = "x86_64")] +// #![feature(stdarch_x86_mm_shuffle)] #![feature(trait_alias)] -#[macro_use] mod imports; use imports::*; -#[macro_use] pub mod m128; +#[macro_use] +mod imports; +use imports::*; +#[macro_use] +pub mod m128; -x!{align} -x!{error} -x!{fill} -x!{allpass} -x!{basic_ops} -x!{clamp} -x!{clear} -x!{convert} -x!{copy} -x!{correlated_noise} -x!{encode} -x!{get_sign} -x!{absmax} -x!{access} -x!{mul} -x!{sub} -x!{sum} -x!{add} -x!{accum} -x!{fast} -x!{flops} -x!{hardclip} -x!{interp} -x!{limit_range} -x!{ord} -x!{padding} -x!{pan} -x!{saturate} -x!{scan} -x!{scratch} -x!{simd} -x!{sinc} -x!{sine} -x!{softclip} -x!{square} -x!{tanh} -x!{trixpan} -x!{utility} -x!{value_cast} -x!{white_noise} -x!{window} +x! {align} +x! {error} +x! {fill} +x! {allpass} +x! {basic_ops} +x! {clamp} +x! {clear} +x! {convert} +x! {copy} +x! {correlated_noise} +x! {encode} +x! {get_sign} +x! {absmax} +x! {access} +x! {mul} +x! {sub} +x! {sum} +x! {add} +x! {accum} +x! {fast} +x! {flops} +x! {hardclip} +x! {interp} +x! {limit_range} +x! {ord} +x! {padding} +x! {pan} +x! {saturate} +x! {scan} +x! {scratch} +x! {simd} +x! {sinc} +x! {sine} +x! {softclip} +x! {square} +x! 
{tanh} +x! {trixpan} +x! {utility} +x! {value_cast} +x! {white_noise} +x! {window} diff --git a/surge-math/src/limit_range.rs b/surge-math/src/limit_range.rs index c6fecf0a..8da5f3c8 100644 --- a/surge-math/src/limit_range.rs +++ b/surge-math/src/limit_range.rs @@ -1,69 +1,61 @@ crate::ix!(); pub trait LimitRange: Sized { - fn limit_range(self, low: Self, hi: Self) -> Self; /** - | provides a possibility for taking a - | non-simd path. if we don't customize it, - | the ordinary simd code will be called if it - | is used in "limit_range" - | - */ - #[inline] fn limit_range_nosimd(self, low: Self, hi: Self) -> Self { - self.limit_range(low,hi) + | provides a possibility for taking a + | non-simd path. if we don't customize it, + | the ordinary simd code will be called if it + | is used in "limit_range" + | + */ + #[inline] + fn limit_range_nosimd(self, low: Self, hi: Self) -> Self { + self.limit_range(low, hi) } } pub fn limit_range(x: T, low: T, high: T) -> T { - x.limit_range(low,high) + x.limit_range(low, high) } impl LimitRange for f32 { - - #[cfg(target_arch = "x86_64")] #[inline] + // #[cfg(target_arch = "x86_64")] + #[inline] fn limit_range(self, low: Self, high: Self) -> Self { - let mut result: f32 = 0.0; unsafe { - - let _s = _mm_load_ss(&self); - let _low = _mm_load_ss(&low); + let _s = _mm_load_ss(&self); + let _low = _mm_load_ss(&low); let _high = _mm_load_ss(&high); let limited = _mm_min_ss(_mm_max_ss(_s, _low), _high); - _mm_store_ss( &mut result, limited) + _mm_store_ss(&mut result, limited) }; result } fn limit_range_nosimd(self, min: f32, max: f32) -> f32 { - std::cmp::max(std::cmp::min(FloatOrd(self),FloatOrd(max)), FloatOrd(min)).0 + std::cmp::max(std::cmp::min(FloatOrd(self), FloatOrd(max)), FloatOrd(min)).0 } } impl LimitRange for i32 { - - fn limit_range(self, l: i32, h: i32) -> i32 - { - std::cmp::max(std::cmp::min(self,h), l) + fn limit_range(self, l: i32, h: i32) -> i32 { + std::cmp::max(std::cmp::min(self, h), l) } } impl LimitRange for u32 { - fn limit_range(self, l: u32, h: u32) -> u32 - { - std::cmp::max(std::cmp::min(self,h), l) + fn limit_range(self, l: u32, h: u32) -> u32 { + std::cmp::max(std::cmp::min(self, h), l) } } - impl LimitRange for f64 { - fn limit_range(self, min:f64, max:f64) -> f64 - { - std::cmp::max(std::cmp::min(FloatOrd(self),FloatOrd(max)), FloatOrd(min)).0 + fn limit_range(self, min: f64, max: f64) -> f64 { + std::cmp::max(std::cmp::min(FloatOrd(self), FloatOrd(max)), FloatOrd(min)).0 } } - diff --git a/surge-math/src/square.rs b/surge-math/src/square.rs index c9bedcb2..38f1bf06 100644 --- a/surge-math/src/square.rs +++ b/surge-math/src/square.rs @@ -1,30 +1,41 @@ crate::ix!(); -#[inline] pub fn v_sqrt_fast(v: __m128) -> __m128 -{ - unsafe { - _mm_rcp_ps(_mm_rsqrt_ps(v)) - } +#[inline] +pub fn v_sqrt_fast(v: __m128) -> __m128 { + unsafe { _mm_rcp_ps(_mm_rsqrt_ps(v)) } } -#[inline] pub fn square(x: f64) -> f64 { +#[inline] +pub fn square(x: f64) -> f64 { x * x } -pub fn get_squaremax(d: *mut f32, nquads: NQ) -> f32 -where >::Error: std::fmt::Debug, - NQ: TryInto +#[cfg(target_arch = "x86_64")] +pub fn get_squaremax(d: *mut f32, nquads: NQ) -> f32 +where + >::Error: std::fmt::Debug, + NQ: TryInto, { let nquads: u32 = nquads.try_into().unwrap(); unsafe { - let mut mx1: __m128 = _mm_setzero_ps(); let mut mx2: __m128 = _mm_setzero_ps(); - for i in (0..nquads).step_by(2) - { - mx1 = _mm_max_ps(mx1, _mm_mul_ps(*(d as *mut __m128).offset(i as isize), *(d as *mut __m128).offset(i as isize))); - mx2 = _mm_max_ps(mx2, _mm_mul_ps(*(d as *mut 
__m128).offset((i + 1) as isize), *(d as *mut __m128).offset((i + 1) as isize))); + for i in (0..nquads).step_by(2) { + mx1 = _mm_max_ps( + mx1, + _mm_mul_ps( + *(d as *mut __m128).offset(i as isize), + *(d as *mut __m128).offset(i as isize), + ), + ); + mx2 = _mm_max_ps( + mx2, + _mm_mul_ps( + *(d as *mut __m128).offset((i + 1) as isize), + *(d as *mut __m128).offset((i + 1) as isize), + ), + ); } mx1 = _mm_max_ps(mx1, mx2); diff --git a/surge-math/src/sum.rs b/surge-math/src/sum.rs index d97fc247..f651bfc4 100644 --- a/surge-math/src/sum.rs +++ b/surge-math/src/sum.rs @@ -3,24 +3,15 @@ crate::ix!(); /// Sums the elements of a __m128 vector and returns the /// result as a f32 value. /// -#[inline] pub fn v_sum(x: __m128) -> f32 -{ +#[cfg(target_arch = "x86_64")] +#[inline] +pub fn v_sum(x: __m128) -> f32 { unsafe { - let mut f: f32 = 0.0; - let mut a: __m128 = _mm_add_ps( - x, - _mm_movehl_ps(x, x) - ); + let mut a: __m128 = _mm_add_ps(x, _mm_movehl_ps(x, x)); - a = _mm_add_ss( - a, - _mm_shuffle_ps( - a, - a, - _MM_SHUFFLE(0, 0, 0, 1)) - ); + a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE!(0, 0, 0, 1))); _mm_store_ss(&mut f, a); diff --git a/surge-math/src/tanh.rs b/surge-math/src/tanh.rs index 268f3e70..959a8fb2 100644 --- a/surge-math/src/tanh.rs +++ b/surge-math/src/tanh.rs @@ -1,7 +1,7 @@ crate::ix!(); #[allow(clippy::many_single_char_names)] -#[cfg(target_arch = "x86_64")] #[inline] +#[cfg(target_arch = "x86_64")] #[inline] pub fn tanh7_ps(x: __m128) -> __m128 { unsafe { @@ -17,12 +17,12 @@ pub fn tanh7_ps(x: __m128) -> __m128 x4 = _mm_mul_ps(x4, xx) ; y = _mm_add_ps(y, _mm_mul_ps(c, x4)) ; - _mm_mul_ps(y, x) + _mm_mul_ps(y, x) } } -#[cfg(target_arch = "x86_64")] #[inline] +#[cfg(target_arch = "x86_64")] #[inline] pub fn tanh_fast32(x_in: f32) -> f32 { assert!(within_range(0.0, x_in, 1.0)); @@ -35,7 +35,7 @@ pub fn tanh_fast32(x_in: f32) -> f32 { let mut denom: f32 = 1.0 + x + xx + (a * x * xx); - unsafe{ _mm_store_ss(&mut denom, + unsafe{ _mm_store_ss(&mut denom, _mm_rcp_ss(_mm_load_ss(&denom)))}; match x_in > 0.0 { @@ -45,7 +45,8 @@ pub fn tanh_fast32(x_in: f32) -> f32 { } #[allow(clippy::many_single_char_names)] -#[inline] pub fn tanh7_ss(x: __m128 ) -> __m128 +#[cfg(target_arch = "x86_64")] #[inline] +pub fn tanh7_ss(x: __m128 ) -> __m128 { unsafe { let a: __m128 = _mm_set1_ps(-1.0 / 3.0); @@ -63,9 +64,10 @@ pub fn tanh_fast32(x_in: f32) -> f32 { } #[allow(clippy::many_single_char_names)] -#[inline] pub fn tanh7_f64(x: f64 ) -> f64 +#[cfg(target_arch = "x86_64")] #[inline] +pub fn tanh7_f64(x: f64 ) -> f64 { - let a: f64 = -1.0 / 3.0; + let a: f64 = -1.0 / 3.0; let b: f64 = 2.0 / 15.0; let c: f64 = -17.0 / 315.0; // return tanh(x); @@ -84,6 +86,7 @@ pub fn tanh_fast32(x_in: f32) -> f32 { } #[allow(clippy::many_single_char_names)] +#[cfg(target_arch = "x86_64")] pub fn tanh7_block(xb: *mut f32, nquads: NQ) where >::Error: std::fmt::Debug, NQ: TryInto { @@ -96,8 +99,8 @@ pub fn tanh7_block(xb: *mut f32, nquads: NQ) let upper_bound: __m128 = _mm_set1_ps(1.1390); let lower_bound: __m128 = _mm_set1_ps(-1.1390); - let mut t: [__m128; 4] = [z128![]; 4]; - let mut x: [__m128; 4] = [z128![]; 4]; + let mut t: [__m128; 4] = [z128![]; 4]; + let mut x: [__m128; 4] = [z128![]; 4]; let mut xx: [__m128; 4] = [z128![]; 4]; for i in (0..nquads).step_by(4) @@ -183,9 +186,9 @@ pub fn tanh7_block(xb: *mut f32, nquads: NQ) y * x_in } -pub fn shafted_tanh(x: f64) -> f64 +pub fn shafted_tanh(x: f64) -> f64 { - (x.exp() - (-x * 1.2).exp()) / + (x.exp() - (-x * 1.2).exp()) / ((x).exp() + (-x).exp()) }
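A minimal smoke-test sketch for the new NEON shims (illustrative only, not part of the patch above). It assumes the module is appended inside surge-aarch64/src/aarch64.rs and is run on an aarch64 host; the module name and the to_array helper are hypothetical.

#[cfg(all(test, target_arch = "aarch64"))]
mod neon_shim_smoke_tests {
    use super::*;
    use std::arch::aarch64::{float32x4_t, vst1q_f32};

    // Copy a float32x4_t out to a plain array so lanes can be compared exactly.
    fn to_array(v: float32x4_t) -> [f32; 4] {
        let mut out = [0.0f32; 4];
        unsafe { vst1q_f32(out.as_mut_ptr(), v) };
        out
    }

    #[test]
    fn shims_match_scalar_reference() {
        unsafe {
            // _mm_set_ps(w, z, y, x) puts x in lane 0, mirroring the SSE argument order.
            let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0); // lanes: [1.0, 2.0, 3.0, 4.0]
            let b = _mm_set1_ps(2.5);

            assert_eq!(to_array(_mm_add_ps(a, b)), [3.5, 4.5, 5.5, 6.5]);
            assert_eq!(to_array(_mm_max_ps(a, b)), [2.5, 2.5, 3.0, 4.0]);
            assert_eq!(to_array(_mm_min_ps(a, b)), [1.0, 2.0, 2.5, 2.5]);
            assert_eq!(_mm_cvtss_f32(_mm_mul_ps(a, b)), 2.5);
        }
    }
}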