Skip to content

Commit 86cb2a0

Browse files
authored
Added i16 widening mul since this is implemented badly on some platforms (#153)
* add reduce min/max along with tests. Also optimize i16 abs for sse2 * mulwiden * better mulwiden * better mulwiden * fix test * add must_use and inline * fix spacing * remove unnecessary paren
1 parent 4b7b5f2 commit 86cb2a0

File tree

2 files changed

+62
-0
lines changed

2 files changed

+62
-0
lines changed

src/i16x8_.rs

+44
Original file line number | Diff line number | Diff line change
@@ -933,6 +933,50 @@ impl i16x8 {
933933
}
934934
}
935935

936+
/// multiplies two i16x8 and returns the result as a widened i32x8
937+
#[inline]
938+
#[must_use]
939+
pub fn mul_widen(self, rhs: Self) -> i32x8 {
940+
pick! {
941+
if #[cfg(target_feature="avx2")] {
942+
let a = convert_to_i32_m256i_from_i16_m128i(self.sse);
943+
let b = convert_to_i32_m256i_from_i16_m128i(rhs.sse);
944+
i32x8 { avx2: mul_i32_keep_low_m256i(a,b) }
945+
} else if #[cfg(target_feature="sse2")] {
946+
let low = mul_i16_keep_low_m128i(self.sse, rhs.sse);
947+
let high = mul_i16_keep_high_m128i(self.sse, rhs.sse);
948+
i32x8 {
949+
a: i32x4 { sse:unpack_low_i16_m128i(low, high) },
950+
b: i32x4 { sse:unpack_high_i16_m128i(low, high) }
951+
}
952+
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
953+
let lhs_low = unsafe { vget_low_s16(self.neon) };
954+
let rhs_low = unsafe { vget_low_s16(rhs.neon) };
955+
956+
let lhs_high = unsafe { vget_high_s16(self.neon) };
957+
let rhs_high = unsafe { vget_high_s16(rhs.neon) };
958+
959+
let low = unsafe { vmull_s16(lhs_low, rhs_low) };
960+
let high = unsafe { vmull_s16(lhs_high, rhs_high) };
961+
962+
i32x8 { a: i32x4 { neon: low }, b: i32x4 {neon: high } }
963+
} else {
964+
let a = self.as_array_ref();
965+
let b = rhs.as_array_ref();
966+
i32x8::new([
967+
i32::from(a[0]) * i32::from(b[0]),
968+
i32::from(a[1]) * i32::from(b[1]),
969+
i32::from(a[2]) * i32::from(b[2]),
970+
i32::from(a[3]) * i32::from(b[3]),
971+
i32::from(a[4]) * i32::from(b[4]),
972+
i32::from(a[5]) * i32::from(b[5]),
973+
i32::from(a[6]) * i32::from(b[6]),
974+
i32::from(a[7]) * i32::from(b[7]),
975+
])
976+
}
977+
}
978+
}
979+
936980
/// transpose matrix of 8x8 i16 matrix
937981
#[must_use]
938982
#[inline]

tests/all_tests/t_i16x8.rs

+18
Original file line number | Diff line number | Diff line change
@@ -361,3 +361,21 @@ fn impl_mul_keep_high() {
361361
let c: [i16; 8] = i16x8::mul_keep_high(a, b).into();
362362
assert_eq!(c, [0, 1, 3, 621, 0, 0, 0, -1]);
363363
}
364+
365+
/// Checks `i16x8::mul_widen` on small mixed-sign values and on the
/// `i16::MIN` / `i16::MAX` boundary lanes, whose products need 32 bits.
#[test]
fn impl_i16x8_mul_widen() {
  let lhs = i16x8::from([1, 2, 3, 4, 5, 6, i16::MIN, i16::MAX]);
  let rhs = i16x8::from([17, -18, 190, -20, 21, -22, i16::MAX, i16::MAX]);

  // Boundary products, computed at i32 width so they cannot overflow.
  let min_times_max = (i16::MIN as i32) * (i16::MAX as i32);
  let max_squared = (i16::MAX as i32) * (i16::MAX as i32);

  let expected =
    i32x8::from([17, -36, 570, -80, 105, -132, min_times_max, max_squared]);
  let actual = lhs.mul_widen(rhs);

  assert_eq!(expected, actual);
}

0 commit comments

Comments
 (0)