@@ -933,6 +933,50 @@ impl i16x8 {
     }
   }

+  /// multiplies two i16x8 and returns the result as a widened i32x8
+  #[inline]
+  #[must_use]
+  pub fn mul_widen(self, rhs: Self) -> i32x8 {
+    pick! {
+      if #[cfg(target_feature="avx2")] {
+        let a = convert_to_i32_m256i_from_i16_m128i(self.sse);
+        let b = convert_to_i32_m256i_from_i16_m128i(rhs.sse);
+        i32x8 { avx2: mul_i32_keep_low_m256i(a, b) }
+      } else if #[cfg(target_feature="sse2")] {
+        let low = mul_i16_keep_low_m128i(self.sse, rhs.sse);
+        let high = mul_i16_keep_high_m128i(self.sse, rhs.sse);
+        i32x8 {
+          a: i32x4 { sse: unpack_low_i16_m128i(low, high) },
+          b: i32x4 { sse: unpack_high_i16_m128i(low, high) }
+        }
+      } else if #[cfg(all(target_feature="neon", target_arch="aarch64"))] {
+        let lhs_low = unsafe { vget_low_s16(self.neon) };
+        let rhs_low = unsafe { vget_low_s16(rhs.neon) };
+
+        let lhs_high = unsafe { vget_high_s16(self.neon) };
+        let rhs_high = unsafe { vget_high_s16(rhs.neon) };
+
+        let low = unsafe { vmull_s16(lhs_low, rhs_low) };
+        let high = unsafe { vmull_s16(lhs_high, rhs_high) };
+
+        i32x8 { a: i32x4 { neon: low }, b: i32x4 { neon: high } }
+      } else {
+        let a = self.as_array_ref();
+        let b = rhs.as_array_ref();
+        i32x8::new([
+          i32::from(a[0]) * i32::from(b[0]),
+          i32::from(a[1]) * i32::from(b[1]),
+          i32::from(a[2]) * i32::from(b[2]),
+          i32::from(a[3]) * i32::from(b[3]),
+          i32::from(a[4]) * i32::from(b[4]),
+          i32::from(a[5]) * i32::from(b[5]),
+          i32::from(a[6]) * i32::from(b[6]),
+          i32::from(a[7]) * i32::from(b[7]),
+        ])
+      }
+    }
+  }
+
   /// transpose matrix of 8x8 i16 matrix
   #[must_use]
   #[inline]
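For reference, a minimal usage sketch of the new `mul_widen` method (not part of the commit itself). It assumes this is the `wide` crate's `i16x8`, with its usual `From<[i16; 8]>` and `to_array` conveniences; the last lane shows a product that overflows `i16` but is preserved in the widened `i32x8`:

```rust
use wide::*;

fn main() {
  // Hypothetical example: lane-wise multiply with widening to i32.
  let a = i16x8::from([1, 2, 3, 4, 5, 6, 7, 32767]);
  let b = i16x8::from([8, 7, 6, 5, 4, 3, 2, 2]);
  let p: i32x8 = a.mul_widen(b);
  // 32767 * 2 = 65534 does not fit in an i16, but the widened lanes keep it.
  assert_eq!(p.to_array(), [8, 14, 18, 20, 20, 18, 14, 65534]);
}
```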
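On the SSE2 path, each 32-bit product is assembled from two 16-bit halves: `mul_i16_keep_low_m128i` yields the low 16 bits of every lane product, `mul_i16_keep_high_m128i` yields the signed high 16 bits, and the unpack intrinsics interleave those halves so adjacent 16-bit lanes form the full little-endian `i32` results. A scalar sketch of why that reconstruction is exact (illustration only, not code from the commit):

```rust
fn main() {
  // For any pair of i16 values, the full 32-bit product can be rebuilt from
  // its low 16 bits (the "keep low" half) and its signed high 16 bits
  // (the "keep high" half), mirroring what the unpack step does per lane.
  let a: i16 = -7;
  let b: i16 = 300;
  let full = i32::from(a) * i32::from(b); // -2100
  let low = (full & 0xFFFF) as u16;       // low half of the product
  let high = (full >> 16) as i16;         // high half, carries the sign
  let rebuilt = (i32::from(high) << 16) | i32::from(low);
  assert_eq!(rebuilt, full);
}
```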